My favorites | Sign in
Project Hosting will be READ-ONLY Thursday at 3:00pm UTC for up to 3 hours for network maintenance.
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
For a given Wikipedia article and time period, this script looks up the
changes introduced by each editor and tallies which words each editor added
or removed.
"""

# TODO:
# 1. Accept start/end date ranges. Parse strings in the yyyy-mm-dd format.
# 2. Test mwclient's start/end specification for revisions.
# 3. Ask mwclient to get full content of pages and in old-to-new sequence.
# 4. Keep track of previous revision content.
# 5. Remove code related to other pages by the same editor.
# 6. Perform a diff to get exact words added or removed. Tricky bit.
# 7. Sort words and use counts (number added, number removed, separately!)
# 8. Save to CSV(s).

import sys
from optparse import OptionParser
import datetime
import csv
import mwclient
from worddiff import worddiff, striptags
from timehelper import parsedate, MWDATEFMT

class IgnoreEditor(Exception):
    """Plain Exception subclass that adds no behaviour of its own.

    NOTE(review): nothing in the visible script raises or catches this;
    presumably it was meant to signal that an editor should be skipped --
    confirm before relying on it.
    """

class NoRevisions(Exception):
    """Raised by analyze() when no revision falls inside the requested period."""


def analyze(article, start, end, wiki, ignore, outfile):
    """Tally the words each editor added to or removed from one article.

    Parameters:
        article -- page title, looked up through ``site.Pages[article]``.
        start   -- start of the period; handed to ``parsedate``.
        end     -- end of the period; handed to ``parsedate``.
        wiki    -- host name passed to ``mwclient.Site``.
        ignore  -- container of editor names whose edits are not tallied
                   (tested with ``in``).
        outfile -- path of the CSV file to write; a false value skips saving.

    Returns None.  Progress and a summary are printed to stdout.

    Raises:
        NoRevisions -- the article has no revisions between start and end.
    """
    start = parsedate(start)
    end = parsedate(end)
    start_stamp = start.strftime(MWDATEFMT)
    end_stamp = end.strftime(MWDATEFMT)

    print("Connecting to %s..." % wiki)
    site = mwclient.Site(wiki)
    print("Looking up revisions between %s and %s." % (start_stamp, end_stamp))
    sys.stdout.write('Revisions for "%s":' % article)
    sys.stdout.flush()
    page = site.Pages[article]

    editors = {}      # editor name -> {word: (times added, times removed)}
    edit_counts = {}  # editor name -> number of tallied edits
    allwords = set()  # every word seen in any tallied diff

    # First query: only used to learn the id of the earliest revision in
    # the period.
    startid = None
    for rev in page.revisions(start=start_stamp, end=end_stamp, dir='newer'):
        startid = rev['revid']
        break
    if startid is None:
        sys.stdout.write('\n')  # finish the pending progress line
        raise NoRevisions('No revisions of "%s" between %s and %s.' % (
            article, start_stamp, end_stamp))

    # We have to do this (query again by revision id) to work around a bug
    # in mwclient.
    revisions = page.revisions(startid=startid,
                               end=end_stamp,
                               dir='newer',
                               prop='ids|timestamp|flags|comment|user|content')

    # This is a typical revision (from the Bangalore page):
    # {u'comment': u'/* Culture and education */',
    #  u'timestamp': (2006, 1, 3, 4, 10, 37, 1, 3, -1),
    #  u'anon': u'', u'revid': 33686266, u'user': u'59.92.138.207'}

    revision_count = 0
    oldcontent = None
    have_reference = False
    for rev in revisions:
        # Always compute the new text, even for ignored editors, so the
        # next diff is taken against the revision immediately before it.
        newcontent = striptags(rev[u'*'])
        if not have_reference:
            # The first revision is only the baseline for the first diff.
            have_reference = True
        else:
            revision_count += 1
            user = rev[u'user']
            if user not in ignore:
                edit_counts[user] = edit_counts.get(user, 0) + 1
                editorstats = editors.setdefault(user, {})
                for change, word in worddiff(oldcontent, newcontent):
                    allwords.add(word)
                    added, removed = editorstats.get(word, (0, 0))
                    if change == '+':
                        editorstats[word] = (added + 1, removed)
                    elif change == '-':
                        editorstats[word] = (added, removed + 1)
        oldcontent = newcontent

        # '\r' rewrites the same console line with the running count.
        sys.stdout.write('\rRevisions for "%s": %d on %s' % (
            article, revision_count,
            '%04d/%02d/%02d %02d:%02d:%02d' % tuple(rev[u'timestamp'][:6])))
        sys.stdout.flush()

    sys.stdout.write('\n')

    # Sorted once here so the CSV columns come out in a stable order.
    editor_names = sorted(editors)
    print("Found %d editors with %d words: %s" % (
        len(editor_names), len(allwords),
        u', '.join(editor_names).encode('utf-8')))

    if outfile:
        print("Saving table to %s..." % outfile)
        # Binary mode is what the Python 2 csv module expects.
        outhandle = open(outfile, 'wb')
        try:
            out = csv.writer(outhandle)
            out.writerow(['Word/Editor'] +
                         [e.encode('utf-8') for e in editor_names])
            out.writerow(['Edit Count'] +
                         [edit_counts[e] for e in editor_names])
            for word in sorted(allwords):
                # Each cell is the (added, removed) pair for that editor.
                row = [editors[e].get(word, (0, 0)) for e in editor_names]
                out.writerow([word.encode('utf-8')] + row)
        finally:
            outhandle.close()
    print("All Done.")

def main(argv):
    '''%prog [options] "Article Name"'''
    # The docstring above is also the optparse usage string, so it is kept
    # exactly as written.
    four_weeks_ago = datetime.datetime.utcnow() - datetime.timedelta(weeks=4)

    opt_parser = OptionParser(usage=main.__doc__)
    opt_parser.add_option(
        '-s', '--start',
        default=four_weeks_ago.strftime('%Y-%m-%d'),
        help='Starting date [default %default]')
    opt_parser.add_option(
        '-e', '--end',
        default='now',
        help="Ending date (or one of now/today/yesterday) [default %default]")
    opt_parser.add_option(
        '-w', '--wiki',
        type='string',
        default='en.wikipedia.org',
        help='MediaWiki site to look up [default %default]')
    opt_parser.add_option(
        '-i', '--ignore',
        type='string',
        action='append',
        default=[],
        help='Ignore edits by this editor (useful to filter maintenance bots)')
    opt_parser.add_option(
        '-o', '--output',
        type='string',
        default=None,
        help='CSV file to save output to')
    opts, args = opt_parser.parse_args(argv)

    # Exactly one positional argument, the article title, is accepted;
    # anything else is reported on stderr with exit status 1.
    if not args:
        sys.stderr.write("Article must be specified.\n")
        return 1
    if len(args) > 1:
        sys.stderr.write("Only one article name may be specified.\n")
        return 1

    return analyze(args[0], opts.start, opts.end, opts.wiki,
                   opts.ignore, opts.output)


# Script entry point: run main() on the command-line arguments (program
# name dropped) and use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)

Change log

r9 by jackerhack on Jun 19, 2009   Diff
Fixes for pulling in 50+ revisions in
script 2.
Go to: 
Project members, sign in to write a code review

Older revisions

r7 by jackerhack on May 21, 2009   Diff
New script to calculate moving average
of edit count per interval (default 1
day) for a given article.
r6 by jackerhack on Mar 19, 2009   Diff
Added count of total number of edits
per editor.
r5 by jackerhack on Mar 17, 2009   Diff
Script 2: analysis by words added or
deleted from the given page over a
given date range. The Python mwclient
library has a bug that causes this
script to fail if there are more than
...
All revisions of this file

File info

Size: 5945 bytes, 147 lines

File properties

svn:mergeinfo
svn:eol-style
native
svn:keywords
Revision
Powered by Google Project Hosting