My favorites | Sign in
Project Logo
                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#! /usr/bin/env python
#
# couchdb-xapian-indexer - Index couchdb databases with Xapian.
# Copyright (C) 2008 Paul J. Davis <paul.joseph.davis@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

import logging
import os
import sys
from optparse import OptionParser, make_option

import couchdb
import simplejson
import xapian

log = logging.getLogger(__name__)

class Index(object):
    """Xapian full-text index for a single CouchDB database.

    The index lives in ``<dir>/<dbname>.idx``. Every indexed CouchDB document
    is stored as one Xapian document identified by the unique term
    ``COUCHDB_ID_<doc id>``; the last processed ``_all_docs_by_seq`` key is
    persisted in the Xapian metadata entry ``"startkey"`` so indexing can
    resume from where the previous run stopped.
    """

    def __init__(self, dir, url, dbname):
        """Open (or create) the Xapian database backing *dbname*.

        Args:
            dir: Directory that holds the ``<dbname>.idx`` Xapian database.
            url: URL handed to ``couchdb.Server``.
            dbname: Name of the CouchDB database to index.
        """
        log.info("Creating indexer for: %s", dbname)
        self.idxfile = os.path.join(dir, '%s.idx' % dbname)
        self.server = couchdb.Server(url)
        self.dbname = dbname
        self.xdb = xapian.WritableDatabase(self.idxfile, xapian.DB_CREATE_OR_OPEN)
        self.index = xapian.TermGenerator()
        self.index.set_stemmer(xapian.Stem("english"))
        # Resume point saved by the last committed reindex(); the metadata
        # value is empty for a freshly created index.
        self.startkey = self.xdb.get_metadata("startkey")
        if self.startkey:
            self.startkey = simplejson.loads(self.startkey)
        self.batch_size = 1000

    def reindex(self):
        """Bring the Xapian index up to date with the CouchDB database.

        Walks the ``_all_docs_by_seq`` view in batches of ``batch_size``
        starting at the stored ``startkey``. Rows flagged ``deleted`` are
        removed from the index; all other documents are fetched and
        (re)indexed. All changes plus the new ``startkey`` are written in a
        single Xapian transaction.

        Returns without doing anything if the CouchDB database does not exist.

        Raises:
            Any exception raised by couchdb, xapian or simplejson; the Xapian
            transaction is cancelled and ``self.startkey`` left unchanged
            before the exception propagates.
        """
        log.info("Reindexing %s", self.dbname)
        # Resolve the database *before* opening a transaction so a failure
        # here is reported as-is instead of being masked by a
        # cancel_transaction() call on a transaction that never started.
        if self.dbname not in self.server:
            return
        db = self.server[self.dbname]

        self.xdb.begin_transaction()
        try:
            # Track progress locally; self.startkey is only updated once the
            # transaction has committed, keeping memory and disk in agreement.
            last_key = self.startkey
            params = {'count': self.batch_size}
            if last_key:
                params['startkey'] = last_key
            docs = db.view('_all_docs_by_seq', **params)
            # NOTE(review): termination relies on the view returning only rows
            # *after* startkey -- confirm against the CouchDB version in use.
            while len(docs) > 0:
                for doc in docs:
                    last_key = doc.key
                    xuid = "COUCHDB_ID_%s" % doc.id
                    if doc.value.get('deleted', False):
                        # Documents are indexed under the unique term `xuid`,
                        # so deletion must use that same term.
                        self.xdb.delete_document(xuid)
                    else:
                        cdoc = db[doc.id]
                        xdoc = xapian.Document()
                        xdoc.add_term(xuid)
                        xdoc.set_data(doc.id)
                        self.index.set_document(xdoc)
                        # Index the string form of every top-level value.
                        # NOTE(review): str() may fail on non-ASCII unicode
                        # values under Python 2 -- confirm the expected data.
                        text = ' '.join(str(cdoc.get(key, '')) for key in cdoc)
                        self.index.index_text(text)
                        self.xdb.replace_document(xuid, xdoc)
                params['startkey'] = last_key
                docs = db.view('_all_docs_by_seq', **params)
            self.xdb.set_metadata("startkey", simplejson.dumps(last_key))
            self.xdb.commit_transaction()
        except:
            # Deliberately broad: roll back on any failure, then re-raise.
            self.xdb.cancel_transaction()
            raise
        self.startkey = last_key

def updates():
    """Yield one decoded JSON object per non-blank line of standard input.

    Reading stops at end of file (``readline()`` returning an empty string).
    Lines containing only whitespace are skipped rather than handed to the
    JSON decoder, so a stray blank line does not abort the indexer.

    Yields:
        The object produced by ``simplejson.loads`` for each line.

    Raises:
        Whatever ``simplejson.loads`` raises for a non-blank, malformed line.
    """
    # readline() is used instead of iterating over sys.stdin directly: file
    # iteration may read ahead (notably on Python 2), which would delay
    # handling of each notification until a buffer fills.
    for line in iter(sys.stdin.readline, ''):
        if not line.strip():
            continue
        yield simplejson.loads(line)

def main(dir, url, exclude):
    """Reindex CouchDB databases as update notifications arrive.

    For every object yielded by ``updates()`` the database named by its
    ``'db'`` key is reindexed, unless that name is excluded. One ``Index`` is
    created lazily per database and reused for later notifications.

    Args:
        dir: Directory in which the Xapian databases are stored; created
            (including missing parents) if it does not exist.
        url: URL of the CouchDB server, passed through to ``Index``.
        exclude: Collection of database names to skip. A single name given as
            a plain string is also accepted.
    """
    if not os.path.isdir(dir):
        # makedirs rather than mkdir so a nested target directory works too.
        os.makedirs(dir)
    if isinstance(exclude, str):
        # A bare string would turn `dbname in exclude` into a substring test.
        exclude = [exclude]
    indices = {}
    for update in updates():
        dbname = update['db']
        if dbname in exclude:
            continue
        if dbname not in indices:
            indices[dbname] = Index(dir, url, dbname)
        indices[dbname].reindex()

if __name__ == '__main__':
    # Command-line entry point: parse options, set up logging, run the indexer.
    options = [
        make_option('-d', '--dir', dest='dir', metavar="DIRECTORY", default="./xapian",
                    help="Directory in which to store xapian databases. [%default]"),
        make_option('-u', '--url', dest='url', metavar="URL", default="http://localhost:5984",
                    help="URL of the couchdb server. [%default]"),
        # action='append' collects every -e occurrence into a list; without it
        # the option stores a single string and the help text is untrue.
        make_option('-e', '--exclude', dest='exclude', metavar='DB_NAME', default=[],
                    action='append',
                    help="Exclude a database from indexing. Can be used multiple times."),
        make_option('-l', '--log', dest='log', metavar="FILE", default='./xapian/index.log',
                    help="Name of the log file to write to."),
    ]
    parser = OptionParser("usage: %prog [OPTIONS]", option_list=options)
    opts, args = parser.parse_args()
    if len(args) != 0:
        # Parenthesised single-argument form prints identically on Python 2 and 3.
        print("Unrecognized arguments: %s" % ' '.join(args))
        parser.print_help()
        sys.exit(-1)
    # The default log file lives inside the index directory, which main() only
    # creates later; make sure the log's directory exists before opening it.
    log_dir = os.path.dirname(os.path.abspath(opts.log))
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    logging.basicConfig(filename=opts.log, level=logging.DEBUG, format="%(levelname)s %(message)s")
    try:
        main(os.path.abspath(opts.dir), opts.url, opts.exclude)
    except Exception:
        # Top-level boundary: record the traceback in the log file before exiting.
        log.exception("Indexer shutting down due to high stress. Relaxation needed.")
Show details Hide details

Change log

r27 by paul.joseph.davis on Aug 08, 2008   Diff
Updated the scripts to use the new
_external interface.

Check out http://github.com/davisp/couchdb

Go to: 
Project members, sign in to write a code review

Older revisions

r22 by paul.joseph.davis on Jun 01, 2008   Diff
Adding the Xapian full text search
query server.

Make sure to check out issue
COUCHDB-74 for current
...
All revisions of this file

File info

Size: 4798 bytes, 119 lines

File properties

svn:executable
*
Hosted by Google Code