My favorites | Sign in
Project Hosting will be READ-ONLY Thursday at 3:00pm UTC for up to 3 hours for network maintenance.
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <ext/hash_map>
#include <cmath>
#include <sstream>
#include <stack>
#include <string>
#include <vector>
#include "u1.h"

/*
* Runs PageRank on the adjacency list passed on stdin.
* Outputs the rank vector on stdout and some progress info on stderr.
*
* USAGE:
* pagerank [-iinteresting_file_name] [-nE_norm] [-eepsilon] [-mmax_iterations]
* where
* interesting_file_name: Name of file containing the "E" vector (see below).
* If unspecified, a uniform E vector is used.
* E_norm: The norm of the "E" vector (see below) as a float.
* epsilon: Let R' and R be the rank vector on successive iterations; the
* algorithm terminates when L1_norm(R'-R) < epsilon.
* max_iterations: Terminate after this many iterations, even if epsilon
* convergence was not satisfied.
*
* The format for the adjacency list is:
* page_number N link_1 ... link_N
* where page_number starts at 0. (That is, non-negative integers delimited by
* whitespace.)
*
* The "interesting" file is a list of page numbers that are "interesting,"
* delimited by whitespace. If specified, we create an E vector in which each
* of the interesting pages has the same value and all other entries are zero.
* The L1 norm of the E vector can be specified using -n.
*
* If no "interesting" file is specified, a uniform E vector is used (every
* page is equally important, initially).
*
* The output format is: page_number R where R is the rank of page number
* page_number. R is a float. Pages are output in order from highest rank to
* lowest rank.
*
* See the Page and Brin paper for more info.
*/

using namespace __gnu_cxx; // for hash containers (not standard)
using namespace std;
const double DEFAULT_EPSILON = 0.001;
const int DEFAULT_MAX_ITERATIONS = 5000;
const double DEFAULT_E_NORM = 0.05;
const int DEFAULT_LINKS = 100000000; // upper bound to save on reallocs

typedef vector<pair<int,int> > edge_list_t;
typedef hash_multimap<int, int> adj_map_t;

/*
* Read edge list from given stream. Also keep track of the max page number
* (see comments in main). Also remove any self links; these cause serious rank
* sinks.
*/
void read_edge_list(istream& is, edge_list_t& edges, int& max_page_number) {
TRACE << "Reading pages from stdin..." << endl;
int page_count = 0;
int self_link_count = 0;
max_page_number = -1;
while (is.good()) {
int page_number, link_count, link;
is >> page_number >> link_count;
if (!is.good()) break;

if (page_number < 0) {
TRACE << "Negative page numbers not allowed (read " << page_count
<< " pages before error)." << endl;
exit(EXIT_FAILURE);
}

if (link_count < 0) {
TRACE << "Negative link count not allowed (read " << page_count
<< " pages before error)." << endl;
exit(EXIT_FAILURE);
}

for (int i = 0; i < link_count; ++i) {
is >> link;
if (!is.good()) {
TRACE << "Unexpected end of file on " << i << "th of " << link_count
<< " links when reading " << page_count << "th page)." << endl;
exit(EXIT_FAILURE);
}

if (link < 0) {
TRACE << "Negative link not allowed (read " << page_count
<< " pages before error)." << endl;
exit(EXIT_FAILURE);
}

// Ignore self links.
if (page_number == link) {
++(self_link_count);
} else {
edges.push_back(make_pair(page_number,link));
}
}

// For dimensioning vectors later.
if (page_number > max_page_number)
max_page_number = page_number;

// For giving better error messages.
++page_count;
}
TRACE << "Read " << page_count << " pages and " << edges.size()
<< " links (and " << self_link_count << " self links) from stdin." << endl;
}

/*
* Functor for updating out degrees.
*/
struct F_out_deg
{
vector<int>& out_deg;
F_out_deg(vector<int>& out_deg) : out_deg(out_deg) { }

void operator()(const pair<int, int>& edge) {
++(out_deg[edge.first]);
}
};

/*
* Functor for removing dangling links.
*/
struct F_undangle
{
vector<int>& out_deg;
F_undangle(vector<int>& out_deg) : out_deg(out_deg) { }

bool operator()(const pair<int, int>& edge) {
if (out_deg[edge.second] == 0) {
--(out_deg[edge.first]);
return false;
}
return true;
}
};

/*
* The L1 norm.
*/
double L1_norm(const vector<double>& v) {
double norm = 0.0;
for (vector<double>::const_iterator it = v.begin(); it != v.end(); ++it)
norm += fabs(*it);
return norm;
}

int main(int argc, char *argv[]) {
string interesting_file;
double E_norm = DEFAULT_E_NORM;
double epsilon = DEFAULT_EPSILON;
int max_iterations = DEFAULT_MAX_ITERATIONS;

/* Get arguments. */
for (int i = 1; i < argc; ++i) {
string arg = argv[i];
if (arg.size() < 2) {
TRACE << "Ignoring argument "<<arg<< ". See comments for usage."<<endl;
continue;
}
string flag = arg.substr(0,2);
if (flag == "-i") {
interesting_file = arg.substr(2);
} else if (flag == "-e") {
istringstream(arg.substr(2)) >> epsilon;
} else if (flag == "-m") {
istringstream(arg.substr(2)) >> max_iterations;
} else if (flag == "-n") {
istringstream(arg.substr(2)) >> E_norm;
} else {
TRACE << "Ignoring argument "<<arg<< ". See comments for usage."<<endl;
}
}

/*
* Get adjacency list from stdin. Track the maximum page number. We assume
* that the numbering is relatively dense, and so we use vectors instead of
* maps for the rank vectors R and E. We also use it to store the out
* degrees.
*
* For 10^8 edges, we need ~ 8*10^8B. For 10^7 nodes, we need ~ 8*10^7B. So
* this should be < 1GB total.
*/
edge_list_t forward;
int max_page_number;
forward.reserve(DEFAULT_LINKS);
read_edge_list(cin, forward, max_page_number);
TRACE << "Maximum page number is " << max_page_number << "." << endl;

/*
* Read in the E vector, if specified.
*/
vector<double> E(max_page_number + 1, 0);
if (interesting_file.size() > 0) {
TRACE << "Building E vector from " << interesting_file << "..." << endl;
ifstream fs(interesting_file.c_str());
int page_count = 0;
while (fs.good()) {
int page_number;
fs >> page_number;
if (fs.good()) {
if (page_number >= 0 && page_number < max_page_number) {
E[page_number] = 1;
++page_count;
} else {
TRACE << "Invalid page number "<<page_number<<"; ignoring."<<endl;
}
}
}
TRACE << "Read " << page_count << " E entries." << endl;
for (vector<double>::iterator it = E.begin(); it != E.end(); ++it) {
(*it) *= E_norm / page_count;
}
} else {
TRACE << "Using default E vector." << endl;
fill(E.begin(), E.end(), E_norm / (max_page_number + 1));
}
TRACE << "E vector norm is " << L1_norm(E) << "." << endl;

/*
* Cache out degrees for performance reasons. We actually need the out degree
* for two purposes. The first is to remove dangling links: links that link
* to a page with no out links are removed. This is because such pages
* accumulate rank from their backlinks but do not distribute it through
* forward links (because they have none). This exaggerates their importance.
* The second use for out degrees is deciding how important each link is; the
* links from a page all share that page's rank equally. We need both the
* "real" out degree (including dangling links) and the "undangled" out
* degree. We currently use the "real" out degree in the rank computation,
* which is slightly different from the paper.
*/
TRACE << "Counting out degrees... " << endl;
vector<int> out_deg(max_page_number + 1, 0);
for_each(forward.begin(), forward.end(), F_out_deg(out_deg));

/*
* Remove dangling links by scanning the edge list and removing any edge
* where the target node has out degree zero. Repeat a few times. Use
* partition to keep the dangling edges at the end and remember the
* partition points for when we have to put the dangling links back in.
*/
TRACE << "Removing dangling links..." << endl;
edge_list_t::iterator forward_end = forward.end();
edge_list_t::iterator new_forward_end;
stack<edge_list_t::iterator> partitions;
int links_removed;

// Note: we use a copy of the out_deg vector; see note above.
vector<int> out_deg_copy = out_deg;

// Note: pushing forward_end twice makes re-dangling easier.
partitions.push(forward_end);
do {
partitions.push(forward_end);

new_forward_end = partition(forward.begin(), forward_end,
F_undangle(out_deg_copy));

links_removed = (forward_end - new_forward_end);
TRACE << "Removed " << links_removed << " dangling links." << endl;

forward_end = new_forward_end;
} while(links_removed > 0);
TRACE << "Left with " << (forward_end - forward.begin()) << " links." << endl;

/*
* We need efficient access to the back links of each node. Scan the edge
* list into a multimap using the reversed pairs. This will be no larger
* than the forward map, so we're up to 2GB or so.
*/
TRACE << "Reversing links..." << endl;
adj_map_t back;
for (edge_list_t::const_iterator it = forward.begin();
it != forward_end; ++it) {
back.insert(make_pair(it->second, it->first));
}
TRACE << "Reversed " << back.size() << " links." << endl;

/*
* Now we're finally ready to compute the page ranks.
*/
int N = E.size();
vector<double> R[2], temp(N, 0);
int p = 1, n = 0; // prev and next indexes into R (avoid copying)
R[0] = E;
R[1].resize(N);
int iteration = 0;
double delta;
do {
swap(p, n);
fill(R[n].begin(), R[n].end(), 0);

TRACE << "On iteration " << iteration << ":" << endl;
TRACE << " ||Rp|| = " << L1_norm(R[p]) << endl;

for (adj_map_t::const_iterator it = back.begin();
it != back.end(); ++it) {
int u = it->first;
int v = it->second;
R[n][u] += R[p][v] / out_deg[v];
}
TRACE << " ||Rn|| = " << L1_norm(R[n]) << endl;

double d = L1_norm(R[p]) - L1_norm(R[n]);
TRACE << " d = " << d << endl;

// R[n] = R[n] + d*E;
for (int i = 0; i < N; ++i) {
R[n][i] = R[n][i] + d * E[i];
}

// delta = ||R[n] - R[p]||_1
for (int i = 0; i < N; ++i) {
temp[i] = R[n][i] - R[p][i];
}
delta = L1_norm(temp);
TRACE << " delta = " << delta << endl;

++iteration;
} while (iteration < max_iterations && delta > epsilon);

/*
* Now we have to deal with the dangling links. It seems sensible to say that
* the rank of a page with no forward links is the weighted rank conferred by
* its parents (ie. do one iteration of page rank w/o the E). To do this
* we have to add the undangled links back in the right order: add back all
* the links that have their source in the undangled graph. We add the links
* back by referring to the partition points from the undangling step.
*
* Start by zeroing the rank of all nodes that were targets of dangling
* links. All links to such a node would be dangling links, and would thus
* be removed, so such a node must have had no backlinks (in degree zero)
* during the computation. Any rank it accumulated is therefore due only
* to normalization, and we're going to renormalize anyway.
*/
TRACE << "Redangling " << (forward.end() - forward_end) << " links..."<<endl;
for (edge_list_t::iterator it = forward_end; it != forward.end(); ++it)
R[n][it->second] = 0;
while (partitions.size() > 1) {
edge_list_t::iterator it = partitions.top();
partitions.pop();
edge_list_t::iterator part_end = partitions.top();
for (; it != part_end; ++it) {
R[n][it->second] += R[n][it->first] / out_deg[it->first];
}
}

TRACE << "Renormalizing..." << endl;
double norm_const = L1_norm(R[n]);
for (int i = 0; i < N; ++i) {
R[n][i] /= norm_const;
}
TRACE << "Finished with L1 norm " << L1_norm(R[n]) << "." << endl;

/*
* Output sorted descending by rank.
*/
TRACE << "Sorting pages by rank..." << endl;
vector<int> pi(N);
for (int i = 0; i < N; ++i) pi[i] = i;
sort(pi.begin(), pi.end(), F_compare_perm<double>(R[n]));

TRACE << "Writing ranks..." << endl;
for (int i = N-1; i >= 0; --i) {
cout << pi[i] << " " << R[n][pi[i]] << endl;
}
TRACE << "Done." << endl;

return 0;
}
/*
* Copyright (c) 2009 John Lees-Miller
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

Change log

r2 by jdleesmiller on Jun 29, 2009   Diff
import
Go to: 
Project members, sign in to write a code review

Older revisions

All revisions of this file

File info

Size: 14015 bytes, 405 lines
Powered by Google Project Hosting