My favorites | Sign in
Project Home Downloads Issues Source
Repository:
Checkout   Browse   Changes   Clones    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
__doc__ = 'High level functions for extracting and storing data'

import os
import re
import csv
import math
import logging
from collections import defaultdict
import common
import settings
import xpath


def get_excerpt(html, try_meta=False, max_chars=255):
    """Return a short plain-text excerpt for an HTML document.

    When try_meta is true the meta description content is looked up first.
    If that yields nothing (or try_meta is false), the body is reduced to
    text and the line with the greatest stripped length is used; ties are
    broken by ordinary string comparison of the lines.

    html is the document source
    try_meta indicates whether to try the meta description tag first
    max_chars is the maximum number of characters in the returned excerpt
    """
    summary = ''
    if try_meta:
        summary = xpath.get(html, '/html/head/meta[@name="description"]/@content')

    if not summary:
        # NOTE(review): presumably `remove` makes xpath.get drop these tags
        # and common.remove_tags strips the remaining markup - confirm
        # against those modules.
        skip_tags = 'hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
        body = xpath.get(html, '/html/body', remove=skip_tags)
        text = common.remove_tags(body)
        if text:
            # rank every line by its stripped length and keep the biggest
            ranked = [(len(line.strip()), line) for line in text.splitlines()]
            summary = max(ranked)[1]

    # unescape first, then truncate
    return common.unescape(summary.strip())[:max_chars]


# Plain address of the form user@domain.ext
_EMAIL_RE = re.compile(r'([\w\.-]{1,64})@(\w[\w\.-]{1,255})\.(\w+)')
# Obfuscated address of the form "user AT domain DOT ext" (any letter case,
# with at most one arbitrary character around the AT / DOT markers)
_OBFUSCATED_EMAIL_RE = re.compile(
    r'([\w\.-]{1,64})\s?.?AT.?\s?([\w\.-]{1,255})\s?.?DOT.?\s?(\w+)',
    re.IGNORECASE)
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
_DIGIT_RE = re.compile(r'\d')


def extract_emails(html):
    """Extract emails and look for common obfuscations

    HTML comments and 'mailto:' prefixes are removed before matching.
    Plain addresses are collected first, then "user AT domain DOT ext"
    style addresses. A candidate is kept only if its extension is not in
    common.MEDIA_EXTENSIONS, is at least 2 characters long, contains no
    digit, and its domain part has at most 3 dots.

    Returns a list of unique emails in order of first appearance.

    >>> extract_emails('')
    []
    >>> extract_emails('hello richard@sitescraper.net world')
    ['richard@sitescraper.net']
    >>> extract_emails('hello richard@<!-- trick comment -->sitescraper.net world')
    ['richard@sitescraper.net']
    >>> extract_emails('hello richard AT sitescraper DOT net world')
    ['richard@sitescraper.net']
    """
    # remove comments, which can obfuscate emails
    html = _HTML_COMMENT_RE.sub('', html).replace('mailto:', '')
    emails = []
    # the same acceptance rules apply to plain and obfuscated candidates
    for pattern in (_EMAIL_RE, _OBFUSCATED_EMAIL_RE):
        for user, domain, ext in pattern.findall(html):
            # NOTE(review): MEDIA_EXTENSIONS presumably rejects file names
            # that merely look like addresses - confirm in common.
            if (ext.lower() not in common.MEDIA_EXTENSIONS
                    and len(ext) >= 2
                    and not _DIGIT_RE.search(ext)
                    and domain.count('.') <= 3):
                email = '%s@%s.%s' % (user, domain, ext)
                if email not in emails:
                    emails.append(email)
    return emails


def parse_us_address(address):
    """Parse usa address into (street, city, state, zipcode)

    The address is split on commas and each part is stripped. The last
    part is searched for a state (2 or more capital letters) followed by
    a zip code (digits, optionally containing dashes or spaces). With 3
    or more parts the second-to-last part is the city and everything
    before it is the street; with fewer parts the city is left empty.

    Street parts are re-joined with ',' (no space). The last part is
    never included in the street, even when no state / zip code was
    found in it. Fields that could not be determined are ''.

    >>> parse_us_address('6200 20th Street, Vero Beach, FL 32966')
    ('6200 20th Street', 'Vero Beach', 'FL', '32966')
    """
    city = state = zipcode = ''
    # Build a real list: a lazy map object cannot be indexed or sliced on
    # Python 3. str.split always yields at least one item, so parts[-1]
    # is safe.
    parts = [part.strip() for part in address.split(',')]

    match = re.search(r'([A-Z]{2,})\s*(\d[\d\-\s]+\d)', parts[-1])
    if match:
        state = match.group(1).strip()
        zipcode = match.group(2).strip()

    if len(parts) >= 3:
        city = parts[-2]
        street = ','.join(parts[:-2])
    else:
        street = ','.join(parts[:-1])

    return street, city, state, zipcode


def distance(p1, p2):
    """Calculate distance between 2 (latitude, longitude) points

    p1 and p2 are (latitude, longitude) pairs in degrees.
    Returns the arc length on a unit sphere, in radians.
    Multiply result by radius of earth (6373 km, 3960 miles)
    """
    lat1, long1 = p1
    lat2, long2 = p2
    # Convert latitude and longitude to
    # spherical coordinates in radians.
    degrees_to_radians = math.pi / 180.0

    # phi = 90 - latitude
    phi1 = (90.0 - lat1) * degrees_to_radians
    phi2 = (90.0 - lat2) * degrees_to_radians

    # theta = longitude
    theta1 = long1 * degrees_to_radians
    theta2 = long2 * degrees_to_radians

    # For two locations in spherical coordinates
    # (1, theta, phi) and (1, theta', phi')
    # cosine( arc length ) =
    #    sin phi sin phi' cos(theta-theta') + cos phi cos phi'
    # distance = rho * arc length
    cos_arc = (math.sin(phi1) * math.sin(phi2) * math.cos(theta1 - theta2)
               + math.cos(phi1) * math.cos(phi2))

    # Rounding can leave the value a hair outside [-1, 1] for identical or
    # antipodal points, which would make math.acos raise ValueError.
    cos_arc = max(-1.0, min(1.0, cos_arc))

    # Remember to multiply arc by the radius of the earth
    # in your favorite set of units to get length.
    return math.acos(cos_arc)

Change log

73121b3e7863 by Qi on Jan 11, 2012   Diff
update parse_us_address
Go to: 
Project members, sign in to write a code review

Older revisions

90175bef7a60 by Qi on Dec 5, 2011   Diff
update extract_emails
5e35d89c12e5 by Qi on Dec 1, 2011   Diff
fix extract_emails
ac8bdfe5d9b0 by Qi on Nov 30, 2011   Diff
fix extract_emails
All revisions of this file

File info

Size: 4258 bytes, 117 lines
Powered by Google Project Hosting