My favorites
|
Sign in
libfacebook
python libraries to access information on facebook.com
Project Home
Downloads
Wiki
Issues
Source
Checkout
|
Browse
|
Changes
|
r6
Source path:
svn
/
trunk
/
faceoff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
#!/usr/bin/env python
""" a module for scraping Facebook """
import os
import sys
import time
import re
import socket
import urllib2
import urlparse
import optparse
import getpass
import mechanize
import BeautifulSoup
__author__ = "Danny Colligan -- dannycolligan@gmail.com"
__version__ = "0.01"
# TODO
# - better error catching
# - catch urlerrors/IOErrors around everything that calls urlopen
# - contacts
# - figure out how to simulate ajax on everyone page
# - div id='friends_target' is present in the pre-ajax html
# - need to figure out what triggers population of page
# - can scrape first 400 contacts from html source (in script tag):
# 'Friends.dnd'
# - but how to do more than 400?
# - perhaps check out spidermonkey?
# - more robust scraping; don't rely on links that some people may not have,
# - go more url-based
# - optimizations
# - eliminate unnecessary requests
# - infer links (esp. jpgs) without parsing multiple pages
# - potential breakage if Facebook's naming conventions change
# Short names accepted for the user-agent option, mapped to the full
# User-agent header value that the scraper sends with its requests.
known_user_agents = {
    'FF3Linux': (
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko/2009032711 '
        'Ubuntu/8.10 (intrepid) Firefox/3.0.8'
    ),
    'IE6': (
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; '
        'SV1; .NET CLR 1.1.4322)'
    ),
    'urllib2': 'Python-urllib/2.5',
}

# Action names given on the command line, mapped to the name of the
# FacebookScraper method that carries out the action.
known_actions = {
    'myphotos': 'export_my_photos',
    'photosofme': 'export_photos_of_me',
    'contacts': 'export_my_contacts',
}
def _rm_rf(dirPath):
    """ Recursively deletes the directory dirPath and everything in it.

    Like rm -rf <dirPath>, symbolic links found inside the tree are removed
    without being followed, so files outside the tree are never touched.
    Unlike rm -rf, an OSError is raised when dirPath does not exist, is not
    a directory, or is itself a symbolic link. """
    # Function-scope import: shutil is not among the module-level imports.
    import shutil
    shutil.rmtree(dirPath)
class FacebookScraper:
    """ Exports information from Facebook.

    Drives a mechanize.Browser through the Facebook web pages, pausing
    between requests, and saves what it finds below download_root_path.
    The export_* methods change the process working directory. """

    def __init__(self, options):
        """ Builds a scraper from the parsed command-line options.

        options must provide the attributes email, password, sleep_time,
        user_agent (a key of known_user_agents), debug_level,
        photo_download_cap, contacts_download_cap, download_root_path,
        overwrite_directories, resume_execution and start_at_album.

        Raises StandardError when user_agent is unknown or when both
        overwrite_directories and resume_execution are set. """
        self.login_url = 'https://login.facebook.com/login.php'
        self.email = options.email
        self.password = options.password
        self.sleep_time = options.sleep_time
        self.br = mechanize.Browser()
        # Membership test rather than indexing, so that an unknown name
        # gives the intended error instead of a KeyError.
        if options.user_agent not in known_user_agents:
            raise StandardError(
                "Unknown user agent '%s'" % options.user_agent)
        self.user_agent = known_user_agents[options.user_agent]
        self.br.addheaders = [('User-agent', self.user_agent)]
        self.br.set_handle_robots(False)
        self.logged_in = False
        self.debug_level = options.debug_level
        self.num_photos_downloaded = 0
        self.photo_download_cap = options.photo_download_cap
        self.num_contacts_downloaded = 0
        self.contacts_download_cap = options.contacts_download_cap
        # The export methods chdir around, so pin a relative root to an
        # absolute path now, while the working directory is still the
        # one the user started in.
        self.download_root_path = os.path.abspath(options.download_root_path)
        self.overwrite_directories = options.overwrite_directories
        self.resume_execution = options.resume_execution
        self.start_at_album = options.start_at_album
        if self.overwrite_directories and self.resume_execution:
            raise StandardError('Can only use one of -o and -s flags')
        self.time_of_last_request = time.time()
        if self.debug_level == 2:
            self.br.set_debug_http(True)
        elif self.debug_level == 3:
            self.br.set_debug_http(True)
            self.br.set_debug_redirects(True)

    def print_stats(self):
        """ Prints statistics related to FacebookScraper activity. """
        print("# photos downloaded: %s" % self.num_photos_downloaded)

    def _sleep(self):
        """ Makes sure there is approximately self.sleep_time seconds between
        each request. Call this function after any requests to Facebook.
        Do this to rate limit to avoid account getting revoked /
        being a bad web citizen by hammering a server with requests. """
        time_since_last_request = time.time() - self.time_of_last_request
        if time_since_last_request < self.sleep_time:
            time.sleep(self.sleep_time - time_since_last_request)
        self.time_of_last_request = time.time()

    def _log_page(self, url=None):
        """ At debug level 1, prints url, or the url of the page currently
        observed by self.br when url is not given. """
        if self.debug_level == 1:
            if url is not None:
                print(url)
            else:
                print(self.br.geturl())

    def _open_and_sleep(self, url):
        """ Opens a specified link and sleeps; use only when you want
        to go to a url that is not linked off the current page. """
        self.br.open(url=url)
        self._log_page()
        self._sleep()

    def _follow_and_sleep(self, text_regex=None, url=None, nr=0):
        """ Opens a specified link (that must be on the current page)
        and sleeps. Use in preference to _open_and_sleep.

        The link is selected by url when given, otherwise by text_regex;
        nr picks among several matching links. A urllib2.URLError (time
        out, etc.) is retried, after sleeping, until the link opens.
        Raises StandardError when neither url nor text_regex is given. """
        if url is not None:
            link_args, branch = {'url': url}, 1
        elif text_regex is not None:
            link_args, branch = {'text_regex': text_regex}, 2
        else:
            raise StandardError(
                "_follow_and_sleep requires text_regex or url argument")
        while True:
            try:
                self.br.follow_link(nr=nr, **link_args)
                break
            except urllib2.URLError as e:  # time out, etc.
                if self.debug_level > 0:
                    # Not every URLError subclass is guaranteed to carry
                    # .reason; fall back to the exception itself.
                    sys.stderr.write(
                        '_follow_and_sleep: urllib2.URLError (%i): %s'
                        % (branch, getattr(e, 'reason', e)))
                self._sleep()
        self._log_page()
        self._sleep()

    def _read_wrapper(self, connection):
        """ Reads and returns all data from connection.

        On socket.error (usually the host reset the connection) the
        current page is reloaded and the read is retried on the fresh
        response, sleeping between attempts. """
        while True:
            try:
                return connection.read()
            except socket.error as e:  # usually host reset connection
                if self.debug_level > 0:
                    sys.stderr.write('_read_wrapper: socket.error %s' % e)
                self.br.reload()
                connection = self.br.response()
                self._sleep()

    def _download_photo_and_sleep(self, photo):
        """ Downloads a given photo and sleeps.

        photo is the parsed <img> tag; its 'src' gives the url, and the
        last path component of that url is the file name used in the
        current working directory. A file that already exists is left
        alone and not counted, which is what makes resuming possible. """
        photo_url = photo['src']
        self._log_page(photo_url)
        filename = photo_url[photo_url.rindex('/') + 1:]
        if os.path.exists(filename):
            return
        self.br.open(url=photo_url)
        # Read everything before creating the file: a failed read must not
        # leave an empty file behind that a resumed run would then skip.
        data = self._read_wrapper(self.br.response())
        # Image data is binary, so never write it in text mode.
        with open(filename, 'wb') as f:
            f.write(data)
        self.num_photos_downloaded += 1
        self.br.back()
        self._sleep()

    def login(self):
        """ Logs in to Facebook.

        Raises StandardError when the page reached after submitting the
        form still has 'login' in its url. """
        self.br.open(self.login_url)
        self._log_page()
        self.br.select_form(nr=0)
        self.br['email'] = self.email
        self.br['pass'] = self.password
        self.br.submit()
        if 'login' in self.br.geturl():
            raise StandardError(
                'Facebook login failed... check your email/password')
        self._log_page()
        self.logged_in = True
        self._sleep()

    def export_my_photos(self):
        """ Exports your photos from Facebook, one directory per album,
        below the 'My Photos' export directory. """
        self._descend_into_dir('My Photos')

        # get to album page
        if not self.logged_in:
            self.login()
        self._follow_and_sleep('Profile')
        self._follow_and_sleep('View Photos of Me.*')
        self._follow_and_sleep('Album', nr=2)

        # get all albums
        album_url = self.br.geturl()
        soup = self._response_soup()
        frame = soup.find('form', {'id' : 'photos_privacy_form'})
        dupalbums = frame.findAll('a', {'href' : re.compile(r'album\.php')})
        # Only every second album link is kept; the surplus links are
        # presumably duplicates of the same album (hence 'dupalbums').
        albums = dupalbums[1::2]
        albums = albums[self.start_at_album:]

        for i, album in enumerate(albums):
            if self.num_photos_downloaded >= self.photo_download_cap:
                break
            album_name = album.contents[0]
            if self.debug_level == 1:
                print("Downloading album: '%s' (%i/%i)"
                      % (album_name, i + 1, len(albums)))
            if not os.path.exists(album_name):
                os.mkdir(album_name)
            os.chdir(album_name)
            # _download_album follows a link on the album list page, so
            # go back there if the previous album moved us away.
            if self.br.geturl() != album_url:
                self._open_and_sleep(album_url)
            self._download_album(album)
            os.chdir('..')

    def _download_album(self, album):
        """ Download the given album into the current working directory.

        album is the parsed <a> tag of the album on the album list page,
        which must be the page currently observed by self.br. Stops at the
        end of the album or once photo_download_cap photos were saved. """
        # get to first photo page
        self._follow_and_sleep(url=album['href'])
        soup = self._response_soup()
        frame = soup.find('div', {'id' : 'album_container'})
        first_photo = frame.find('a')
        self._follow_and_sleep(url=first_photo['href'])
        first_photo_url = self.br.geturl()

        # start downloading photos; '<' so that exactly the cap is honored
        while self.num_photos_downloaded < self.photo_download_cap:
            soup = self._response_soup()
            photo = soup.find('img', {'id' : 'myphoto'})
            self._download_photo_and_sleep(photo)
            self._follow_and_sleep('Next')
            if first_photo_url == self.br.geturl():
                break  # 'Next' wrapped around: end of photos

    def _descend_into_dir(self, name):
        """ Manages the setup of export directories.

        Makes <download_root_path>/<name> the working directory. An
        existing directory is emptied when overwrite_directories is set,
        reused when resume_execution is set, and otherwise makes this
        raise OSError. A missing directory is always created. """
        dirname = os.path.join(self.download_root_path, name)
        if os.path.exists(dirname):
            if self.overwrite_directories:
                _rm_rf(dirname)
                os.mkdir(dirname)
            elif not self.resume_execution:
                raise OSError("'%s' already exists" % dirname)
        else:
            # Also when resuming: there may simply be nothing to resume
            # from yet, which must not make the chdir below fail.
            os.mkdir(dirname)
        os.chdir(dirname)

    def _pid_from_url(self, url):
        """ Returns the value of the 'pid' query parameter of url, or
        None when the url has no such parameter. """
        query = urlparse.urlparse(url)[4]
        for pair in query.split('&'):
            key, _, value = pair.partition('=')
            if key == 'pid':
                return value
        return None

    def export_photos_of_me(self):
        """ Exports all photos that have you tagged, into the
        'Photos of Me' export directory. Stops at the end of the photos
        or once photo_download_cap photos were saved. """
        self._descend_into_dir('Photos of Me')

        # get to first photo page
        if not self.logged_in:
            self.login()
        self._follow_and_sleep('Profile')
        self._follow_and_sleep('View Photos of Me.*')
        soup = self._response_soup()
        frame = soup.find('div', {'id' : 'photos_of_wrapper'})
        first_photo = frame.find('a')
        self._follow_and_sleep(url=first_photo['href'])
        first_photo_key = None

        # start downloading photos; '<' so that exactly the cap is honored
        while self.num_photos_downloaded < self.photo_download_cap:
            current_url = self.br.geturl()
            # Photos are told apart by their pid; without one the whole
            # url has to serve as the identity.
            current_photo_key = self._pid_from_url(current_url)
            if current_photo_key is None:
                current_photo_key = current_url
            if first_photo_key is None:
                # first photo
                first_photo_key = current_photo_key
            elif first_photo_key == current_photo_key:
                # 'Next' wrapped around: end of photos
                return
            soup = self._response_soup()
            photo = soup.find('img', {'id' : 'myphoto'})
            self._download_photo_and_sleep(photo)
            self._follow_and_sleep('Next')

    def export_my_contacts(self):
        """ Exports Facebook contact ('friends') info.

        Not implemented yet: always raises NotImplementedError. The draft
        below is kept for reference. """
        raise NotImplementedError
#        self._descend_into_dir("Contacts")
#
#        # get to first page of all friends
#        if not self.logged_in:
#            self.login()
#        self._follow_and_sleep('Profile')
#        soup = self._response_soup()
#        everyone_links = soup.findAll('a', {'href' : re.compile('everyone')})
#        self._follow_and_sleep(url=everyone_links[1]['href'])
#
#        # start downloading contact info
#        try:
#            while True:
#                soup = self._response_soup()
#                frame = soup.find('div', {'id' : 'friends_target'})
#                contacts = frame.findAll('a', {'class' : 'fname'})
#                if len(contacts) <= 1:
#                    raise StandardError, 'No contacts found on contacts page!'
#
#                for contact in contacts:
#                    if self.num_contacts_downloaded >= self.contacts_download_cap:
#                        return
#                    self._download_contact(contact)
#                self._follow_and_sleep('Next')
#
#        except mechanize.LinkNotFoundError:
#            # end of friends
#            return

    def _response_soup(self):
        """ Returns a BeautifulSoup instance of the current page. """
        return BeautifulSoup.BeautifulSoup(
            self._read_wrapper(self.br.response()))

#    def _download_contact(self, contact):
#        self._follow_and_sleep(url=contact['href'])
#        self._follow_and_sleep('Info')
#        self._save_contact_info(contact.contents[0])
#        self.br.back(nr=2)
#        self.num_contacts_downloaded += 1
#
#    def _save_contact_info(self, name):
#        soup = self._response_soup()
#
#        print name
#        basic_info = soup.find('div', {'id' : 'info_section_info_basic'})
#        if basic_info:
#            headings = basic_info.findAll('dt')
#            entries = basic_info.findAll('dd')
#            for i in range(len(headings)):
#                print "\t%20s%20s" % \
#                    (headings[i].contents[0], entries[i].contents[0])
#
#        contact_info = soup.find('div', {'id' : 'info_section_info_contact'})
#        if contact_info:
#            headings = basic_info.findAll('dt')
#            entries = basic_info.findAll('dd')
#            for i in range(len(headings)):
#                print "\t%20s%20s" % (headings[i], entries[i])
if __name__ == '__main__':
    # Command-line entry point: parse the options, build a FacebookScraper
    # from them and run every requested action in the order given.

    # Network operations without their own timeout give up after this
    # many seconds instead of blocking forever.
    timeout = 10
    socket.setdefaulttimeout(timeout)

    # Default for the download caps; large enough to mean "no cap".
    max_int = 2**32 - 1

    usage = """faceoff.py [options] -- [actions]
Supported actions:
'myphotos' export all of your photos
'photosofme' export all photos that have you tagged in them
'contacts' export your contacts (Facebook's 'friends')
"""
    parser = optparse.OptionParser(usage=usage)

    # required arguments
    required = optparse.OptionGroup(parser, 'Required Arguments')
    required.add_option("-e", "--email", dest="email",
        help="Facebook login email (required)", metavar="EMAIL")
    required.add_option("-p", "--password", dest="password",
        help="Facebook login password (required)", metavar="PASS")
    parser.add_option_group(required)

    # optional arguments
    optional = optparse.OptionGroup(parser, 'Optional Arguments')
    optional.add_option("-r", "--export-root-dir", dest="download_root_path",
        help="set root export directory (default cwd)",
        metavar="DIR", default=os.getcwd())
    optional.add_option("-d", "--debug_level", dest="debug_level", type="int",
        help="set debugging level (default 0)", metavar="LEVEL", default=0)
    optional.add_option("-u", "--user-agent", dest="user_agent",
        help="user agent string to use (default IE6)", default='IE6')
    optional.add_option("-t", "--sleep-time", dest="sleep_time", type="float",
        help="time to sleep after each request (default 1.0)", default=1.0)
    optional.add_option("-c", "--photo-download-cap",
        dest="photo_download_cap", type="int",
        help="download max N photos", default=max_int, metavar="N")
    optional.add_option("-x", "--contacts-download-cap",
        dest="contacts_download_cap", type="int",
        help="download max N contacts", default=max_int, metavar="N")
    optional.add_option("-n", "--start-at-album", dest="start_at_album",
        help="for myphotos, start downloading at album number N "
             "(inclusive, count starting at 0)",
        default=0, metavar="N", type="int")
    parser.add_option_group(optional)

    # optional flags
    flags = optparse.OptionGroup(parser, 'Optional Flags')
    flags.add_option("-o", "--overwrite_directories",
        dest="overwrite_directories", help="overwrite export directories",
        action="store_true", default=False)
    flags.add_option("-s", "--resume-execution", dest="resume_execution",
        help="pick up scraping where you last left off", action="store_true",
        default=False)
    flags.add_option("-g", "--use-getpass", dest="use_getpass",
        help="use getpass instead of -p flag", action="store_true",
        default=False)
    parser.add_option_group(flags)

    # parse options
    options, actions = parser.parse_args(sys.argv[1:])

    # use getpass, if wanted
    if options.use_getpass:
        options.password = getpass.getpass('Enter Facebook password: ')

    # The "required" options are only required by convention; enforce
    # them here so a missing one fails with a usage message rather than
    # somewhere inside the login form handling.
    if not options.email:
        parser.error('a Facebook login email is required (-e)')
    if not options.password:
        parser.error('a Facebook login password is required (-p or -g)')

    # Reject unknown actions before any scraping starts, so a typo in a
    # later action cannot abort the run after earlier ones have finished.
    for action in actions:
        if action not in known_actions:
            raise StandardError("Unknown Action '%s'" % action)

    # instantiate FacebookScraper
    fs = FacebookScraper(options)

    # dispatch actions
    for action in actions:
        if options.debug_level == 1:
            print(known_actions[action])
        func = getattr(fs, known_actions[action])
        func()
Show details
Hide details
Change log
r6
by dannycolligan on Apr 05, 2009
Diff
Added faceoff.py
Go to:
/trunk/faceoff.py
Project members,
sign in
to write a code review
Older revisions
All revisions of this file
File info
Size: 14759 bytes, 452 lines
View raw file
Hosted by