My favorites
▼
|
Sign in
django-galaxy
A reusable application for building aggregated blogging sites
Project Home
Downloads
Wiki
Issues
Source
Checkout
Browse
Changes
Source path:
svn
/
trunk
/
utilities.py
r2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
from BeautifulSoup import BeautifulSoup
# Extra keywords, grouped by topic.  is_about_topic() consults this table
# when asked to check related terms, so a post that mentions one of these
# words counts as being about the topic even if the topic's own name never
# appears in the text.
RELATED_TOPICS = dict(
    django=[
        "pownce",
        "everyblock",
        "holovaty",
        "ellington",
        "jellyroll",
        "appengine",
        "byteflow",
        "formwizard",
        "newforms",
        "feedclowd",
        "modelforms",
        "multipleinheritance",
        "queryset",
        "qsrf",
    ],
)
# Patterns used by clean_body(), compiled once at import time.
# The ``(?![\w-])`` lookahead stops ``<div`` / ``<h1`` from matching longer
# tag names such as ``<divider>``; ``[^>]*`` lets a tag carry attributes and
# span several lines.
_DIV_TAG = re.compile(r'</?div(?![\w-])[^>]*>', re.IGNORECASE)
_HEADING_OPEN = re.compile(r'<h\d(?![\w-])[^>]*>', re.IGNORECASE)
_HEADING_CLOSE = re.compile(r'</h\d\s*>', re.IGNORECASE)
# DOTALL so that comments spanning several lines are removed as well.
_HTML_COMMENT = re.compile(r'<!--.*?-->', re.DOTALL)
_FEEDBURNER_SRC = re.compile('feedburner')


def clean_body(body):
    """Return a simplified copy of the HTML in *body*.

    The following clean-ups are applied, in order:

    * ``<div ...>`` / ``</div>`` tags are dropped (their content is kept);
    * heading tags (``<h1>`` ... with or without attributes) become
      ``<p class="heading">`` / ``</p>``;
    * HTML comments, including multi-line ones, are removed;
    * if the text mentions ``feedburner``, images whose ``src`` references
      feedburner are removed together with the link wrapping them.

    Leading and trailing whitespace is stripped from the result.  An empty
    or ``None`` body yields ``''``.
    """
    if not body:
        return ''

    body = _DIV_TAG.sub('', body)
    body = _HEADING_OPEN.sub('<p class="heading">', body)
    body = _HEADING_CLOSE.sub('</p>', body)
    body = _HTML_COMMENT.sub('', body)

    # Remove junky feedburner links.
    # We do not remove every link that references feedburner, only those
    # that contain an image referencing feedburner: some publishers rewrite
    # all links in their content to proxy through feedburner for tracking,
    # and those links are genuine content.
    if 'feedburner' in body:
        soup = BeautifulSoup(body)
        for image in soup.findAll('img', src=_FEEDBURNER_SRC):
            parent = image.parent
            if parent is not None and getattr(parent, 'name', None) == 'a':
                # Dropping the wrapping link takes the image with it.
                parent.extract()
            else:
                # NOTE(review): the image is not directly inside a link, so
                # only the image is removed; extracting the parent here
                # would throw away an unrelated container (e.g. a whole
                # paragraph).  Confirm against real feeds.
                image.extract()
        # This module targets Python 2 (BeautifulSoup 3), hence unicode().
        body = unicode(soup)

    return body.strip()
def clean_title(title):
    """Return *title* without its ``[...]`` groups.

    Every square-bracketed run on a single line (brackets included) is
    deleted, then leading and trailing whitespace is trimmed.  Whitespace
    left behind in the middle of the title is not collapsed.
    """
    without_brackets = re.sub(r'\[(.*?)\]', '', title)
    return without_brackets.strip()
def is_about_topic(data, topic, check_related=False):
    """Return True if any string in *data* appears to be about *topic*.

    For now the test is deliberately simplistic: a case-insensitive
    substring search for *topic* in each item of *data*.  When
    *check_related* is true, the terms listed for the topic in
    ``RELATED_TOPICS`` are searched for as well; a topic with no entry
    there simply has no related terms.

    Returns ``False`` when nothing matches (including empty *data*).

    Ideas for the future:

    * maintain a table of strings related to each topic and look for
      those too;
    * return a confidence value -- as more blogs mentioning the word
      'django' turn up we may want to check whether they are more
      programmy than music'y, e.g. by looking for terms mostly found in
      genuine django posts;
    * weight stories higher when they have a higher density of django to
      non-django keywords.
    """
    needle = topic.lower()
    terms = [needle]
    if check_related:
        # Try the topic exactly as given first, then its lower-cased form.
        related = RELATED_TOPICS.get(topic)
        if related is None:
            related = RELATED_TOPICS.get(needle, ())
        terms.extend(related)

    for item in data:
        haystack = item.lower()  # lower-case once per item, not per term
        if any(term in haystack for term in terms):
            return True
    return False
Show details
Hide details
Change log
r2
by clintecker on Jul 9, 2008
Diff
Bringing everything in for the first time
Go to:
/trunk/__init__.py
/trunk/debugging.py
/trunk/managers.py
/trunk/models.py
/trunk/scripts
/trunk/scripts/get_new_posts.py
/trunk/templates
/trunk/templates/base.html
/trunk/templates/galaxy
.../templates/galaxy/post_list.html
/trunk/time_utilities.py
/trunk/urls.py
/trunk/utilities.py
/trunk/views.py
Project members,
sign in
to write a code review
Older revisions
All revisions of this file
File info
Size: 2831 bytes, 88 lines
View raw file
Powered by
Google Project Hosting