My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
/*
* Copyright (c) 2008-2009, Computational Crawling LP
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* * Neither the name of Computational Crawling LP, 80legs, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.eightylegs.customer.job;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;

/**
* Class CustomerResults.
*
* This serializes and deserializes the customer results in 80legs
*
* A simple way to get the CustomerResults is to use the static function readFile().
* It should be called like this:
* HashMap<String,byte[]> results = CustomerResults.readFile ( fileName );
*
* The basic deserialize flow is as follows, but you can replace the BufferedInputStream
* with any InputStream if you want to read some other way (e.g. read from memory):
* BufferedInputStream r = new BufferedInputStream ( new FileInputStream ( fileName ) );
* CustomerResults results = new CustomerResults();
* try {
* results.startRead ( r );
* String url;
* while ( (url = results.readNextUrl(r)) != null ) {
* byte[] customerData = results.readNextData ( r );
*
* // do something with the url and customerData
* }
* }
* catch ( Exception e ) {
* // format error
* }
*
* For people interested in deserializing in other languages, the file format this
* creates and reads is:
* <classID><versionID><URL-SIZE><URL><DATA-SIZE><DATA>
* - the last 4 items (<URL-SIZE><URL><DATA-SIZE><DATA>) repeat for each url/data pair
* - <classID>, <versionID>, <URL-SIZE>, and <DATA-SIZE> are encoded 32-bit integers
* - The url is encoded using UTF-8.
*
*/
public class CustomerResults {
    // Magic number written first in every file; used to partially validate the file type.
    private static final int classID = 218217067;
    // Highest format version this class can read; also the version it writes.
    private static final int maxVersionID = 1;
    // Character set used to encode and decode URLs.
    private static final String utfCharSet = "UTF-8";
    // One-char-per-byte character set used to carry raw binary data inside a String.
    private static final String binaryCharSet = "ISO-8859-1";

    // 4-byte scratch buffer shared by every integer read/write on this instance.
    // Because it is shared, a single instance must not be used by several threads at once.
    private final byte[] useBytes = new byte[4];
    // Version of the stream currently being read or written; set by startRead()/startWrite().
    private int curVersionID;

    /**
     * Creates a reader/writer. Call {@link #startRead(InputStream)} or
     * {@link #startWrite(OutputStream)} before reading or writing records.
     */
    public CustomerResults()
    {
    }

    /**
     * This method completely reads a CustomerResults file and returns the results as a
     * {@code HashMap<String,byte[]>} with the URL in the String and the binary results in the byte[]
     *
     * Warning: The HashMap returned by this function can be quite large. If you want to be
     * able to read a large file that cannot fit into memory, you can use the code below as
     * a model. Just replace the HashMap stuff with your own code to process the results.
     *
     * This method is best effort: if the file is malformed or truncated, every url/data pair
     * that was read successfully before the problem is still returned, and the problem itself
     * is not reported. A file with an invalid header therefore yields an empty map.
     *
     * Note that this function is static. It should be called like this:
     * {@code HashMap<String,byte[]> results = CustomerResults.readFile ( fileName );}
     *
     * @param fileName This is the file to be read
     *
     * @return The url/byte[] pairs returned in a HashMap
     *
     * @throws Exception if the file cannot be opened or closed
     */
    public static HashMap<String,byte[]> readFile(String fileName) throws Exception {
        BufferedInputStream r = new BufferedInputStream(new FileInputStream(fileName));
        HashMap<String,byte[]> resultsData = new HashMap<String,byte[]>();

        try {
            readAll(r, resultsData);
        }
        catch ( Exception ignored ) {
            // Deliberate best effort: keep the pairs collected before the failure
            // instead of discarding everything because of one bad record.
        }
        finally {
            r.close();
        }

        return resultsData;
    }

    /**
     * This method completely reads a CustomerResults from the input string and returns the results as a
     * {@code HashMap<String,byte[]>} with the URL in the String and the binary results in the byte[]
     *
     * The string must carry the raw file bytes one byte per character (character values 0-255),
     * for example a string created with {@code new String(bytes, "ISO-8859-1")}. It is converted
     * back to bytes with that same character set.
     *
     * Warning: The HashMap returned by this function can be quite large.
     *
     * Note that this function is static. It should be called like this:
     * {@code HashMap<String,byte[]> results = CustomerResults.readString ( inData );}
     *
     * @param inData This is the string to be read
     *
     * @return The url/byte[] pairs returned in a HashMap
     *
     * @throws Exception if the data does not start with a valid header or a record is malformed
     */
    public static HashMap<String,byte[]> readString(String inData) throws Exception {
        // UTF-8 cannot be used here: the little-endian classID header contains the byte 0xBA
        // directly after an ASCII byte, a sequence a UTF-8 encoder never produces, so a
        // UTF-8 conversion could never yield a valid header.
        byte[] bytes = inData.getBytes(binaryCharSet);
        ByteArrayInputStream s = new ByteArrayInputStream(bytes);
        HashMap<String,byte[]> resultsData = new HashMap<String,byte[]>();

        readAll(s, resultsData);

        return resultsData;
    }

    /**
     * This method initializes a new read from a CustomerResults file. It validates the classID and versionID.
     *
     * @param r This is a InputStream for the input of the data
     *
     * @throws Exception if the stream ends inside the header, the classID does not match,
     *                   or the versionID is newer than this class supports
     */
    public void startRead( InputStream r ) throws Exception {
        // read and validate the classID - this is used to partially validate that this is the right file type.
        int curClassID = readRequiredInt(r, "classID");
        if ( curClassID != classID ) {
            throw ( new Exception("Bad classID=" + curClassID + ", should be " + classID +
                    ". This is probably not a valid CustomerResults file.") );
        }

        // read and validate the versionID - this is used internally to handle different versions of this file
        curVersionID = readRequiredInt(r, "versionID");
        if ( curVersionID > maxVersionID ) {
            throw ( new Exception("Bad Version Code=" + curVersionID + ", maxVersionID=" + maxVersionID +
                    ". This is either not a valid file or a newer version of CustomerResults.java is available on the 80legs website") );
        }
    }

    /**
     * This method initializes a new write to a new file. It writes the format classID and versionID.
     *
     * @param w This is a OutputStream for the output of the data
     *
     * @throws Exception if writing to the stream fails
     */
    public void startWrite( OutputStream w ) throws Exception {
        intToByteArray(classID, useBytes, 0);
        w.write(useBytes);

        curVersionID = maxVersionID;
        intToByteArray(curVersionID, useBytes, 0);
        w.write(useBytes);
    }

    /**
     * This method writes a single url/data pair to an output file
     *
     * @param w This is a OutputStream for the output of the data
     * @param url The URL to be encoded
     * @param customData The customerData to be encoded
     *
     * @return the size of this write, in bytes, including the two 4-byte size fields
     */
    public int writeResult ( OutputStream w, String url, byte[] customData ) throws UnsupportedEncodingException, IOException {
        int totalBytes = 0;
        totalBytes += writeBytesAndSize ( w, url.getBytes(utfCharSet) );
        totalBytes += writeBytesAndSize ( w, customData );

        return totalBytes;
    }

    /**
     * This method reads a single url from a CustomerResults file
     *
     * @param r This is a InputStream for the input of the data
     *
     * @return the URL, or null when the stream ends before another complete size field
     *
     * @throws Exception if the version is unknown, the size is negative, or the stream
     *                   ends in the middle of the url bytes
     */
    public String readNextUrl ( InputStream r ) throws Exception {
        if ( curVersionID == 1 ) {
            byte[] urlBytes = readBytesAndSize ( r );
            if ( urlBytes == null ) {
                return null;
            }
            return new String ( urlBytes, 0, urlBytes.length, utfCharSet );
        }
        else {
            throw ( new Exception("Unknown versionID=" + curVersionID) );
        }
    }

    /**
     * This method reads a single byte[] from a CustomerResults file
     *
     * @param r This is a InputStream for the input of the data
     *
     * @return the customer data as a byte[], or null when the stream ends before another
     *         complete size field
     *
     * @throws Exception if the version is unknown, the size is negative, or the stream
     *                   ends in the middle of the data bytes
     */
    public byte[] readNextData ( InputStream r ) throws Exception {
        if ( curVersionID == 1 ) {
            return readBytesAndSize ( r );
        }
        else {
            throw ( new Exception("Unknown versionID=" + curVersionID) );
        }
    }



    /*
     * private members below
     */


    /*
     * reads the header and then every url/data pair from r, adding each pair to resultsData
     */
    private static void readAll ( InputStream r, HashMap<String,byte[]> resultsData ) throws Exception {
        CustomerResults customerResults = new CustomerResults();
        customerResults.startRead(r);

        String url;
        while ((url = customerResults.readNextUrl(r)) != null) {
            // customerData is null when the stream ends right after a url
            byte[] customerData = customerResults.readNextData(r);

            resultsData.put(url, customerData);
        }
    }

    /*
     * reads bytes until buf is full or the stream ends; returns the number of bytes stored.
     * A single InputStream.read() call may deliver fewer bytes than requested, hence the loop.
     */
    private static int readFully ( InputStream r, byte[] buf ) throws IOException {
        int total = 0;
        while ( total < buf.length ) {
            int n = r.read(buf, total, buf.length - total);
            if ( n < 0 ) {
                break;
            }
            total += n;
        }
        return total;
    }

    /*
     * reads a mandatory 32-bit header integer; fails if the stream ends before all 4 bytes arrive
     */
    private int readRequiredInt ( InputStream r, String fieldName ) throws IOException {
        if ( readFully(r, useBytes) < useBytes.length ) {
            throw new IOException("Unexpected end of data while reading " + fieldName +
                    ". This is probably not a valid CustomerResults file.");
        }
        return byteArrayToInt(useBytes, 0);
    }

    /*
     * reads the size as a 32-bit integer then reads and returns that number of bytes.
     * returns null when the stream ends before a complete size field could be read.
     */
    private byte[] readBytesAndSize ( InputStream r ) throws Exception {
        if ( readFully(r, useBytes) < useBytes.length ) {
            return null;
        }

        int size = byteArrayToInt(useBytes, 0);
        if ( size < 0 ) {
            throw ( new Exception ("Bad Size") );
        }

        byte[] b = new byte[size];
        int got = readFully(r, b);
        if ( got < size ) {
            // never hand back a partially filled (zero-padded) array as if it were real data
            throw new IOException("Truncated record: expected " + size + " bytes but found only " + got);
        }
        return b;
    }

    /*
     * writes the size as a 32-bit integer then writes the bytes
     */
    private int writeBytesAndSize ( OutputStream w, byte[] b ) throws IOException {
        intToByteArray(b.length, useBytes, 0);
        w.write(useBytes);
        w.write(b);

        return useBytes.length + b.length;
    }

    /*
     * convert an int to a byte array in little endian
     */
    private static void intToByteArray(int i, byte[] b, int byteStart) {
        b[byteStart+3] = (byte)((i>>24) & 0xFF);
        b[byteStart+2] = (byte)((i>>16) & 0xFF);
        b[byteStart+1] = (byte)((i>>8) & 0xFF);
        b[byteStart+0] = (byte)(i & 0xFF);
    }

    /*
     * convert an byte array to an int (assumes byte-array was little endian)
     */
    private static int byteArrayToInt(byte[] b, int byteStart) {
        return ((int)b[byteStart+3] << 24)
                | (((int)b[byteStart+2] & 0xFF) << 16)
                | (((int)b[byteStart+1] & 0xFF) << 8)
                | ((int)b[byteStart+0] & 0xFF);
    }
}

Change log

r42 by brad.wil...@80legs.com on Jul 27, 2009   Diff
Read and return all of the good results
Go to: 
Project members, sign in to write a code review

Older revisions

r40 by brad.wil...@80legs.com on Jun 12, 2009   Diff
Bug fix
r39 by brad.wil...@80legs.com on Jun 9, 2009   Diff
Make writeResult() return the size of
what was written
r20 by brad.wil...@80legs.com on May 15, 2009   Diff
Change to packaging and comments for
javadoc
All revisions of this file

File info

Size: 11002 bytes, 294 lines
Powered by Google Project Hosting