My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# -*- coding: utf8 -*-

import cPickle
import os
import random
import operator
import glob
from bz2 import BZ2File as CompressedFile

from lru_cache import lru_cache
from src import utiles

class Index(object):
'''Handles the index.'''

def __init__(self, directory):
self._directory = directory

# open the key shelve
keyfilename = os.path.join(directory, "easyindex.key.bz2")
fh = CompressedFile(keyfilename, "rb")
self.key_shelf = cPickle.load(fh)
fh.close()

# see how many id files we have
idsfilename = os.path.join(directory, "easyindex-*.ids.bz2")
filenames = []
for fn in os.listdir(directory):
if fn.startswith("easyindex-") and \
fn.endswith(".ids.bz2"):
filenames.append(fn)
self.idfiles_count = len(filenames)

@lru_cache(20)
def _get_ids_shelve(self, cual):
'''Return the ids index.'''
fname = os.path.join(self._directory, "easyindex-%03d.ids.bz2" % cual)
fh = CompressedFile(fname, "rb")
idx = cPickle.load(fh)
fh.close()
return idx

def _get_info_id(self, allids):
'''Returns the values for the given ids.

As it groups the ids according to the file, is much faster than
retrieving one by one.
'''
# group the id per file
cuales = {}
for i in allids:
cual = utiles.coherent_hash(i) % self.idfiles_count
cuales.setdefault(cual, []).append(i)

# get the info for each file
for cual, ids in cuales.items():
idx = self._get_ids_shelve(cual)
for i in ids:
yield idx[i]

def keys(self):
"""Returns an iterator over the stored keys."""
return self.key_shelf.iterkeys()

def items(self):
'''Returns an iterator over the stored items.'''
for key, allids in self.key_shelf.iteritems():
values = self._get_info_id(allids)
yield key, list(values)

def values(self):
'''Returns an iterator over the stored values.'''
for key, allids in self.key_shelf.iteritems():
values = self._get_info_id(allids)
for v in values:
yield v

def random(self):
'''Returns a random value.'''
cual = random.randint(0, self.idfiles_count - 1)
idx = self._get_ids_shelve(cual)
return random.choice(idx.values())

def __contains__(self, key):
'''Returns if the key is in the index or not.'''
return key in self.key_shelf

def _merge_results(self, results):
# vemos si tenemos algo más que vacio
results = filter(bool, results)
if not results:
return []

# el resultado final es la intersección de los parciales ("and")
intersectados = reduce(operator.iand, (set(d) for d in results))
final = {}
for result in results:
for pagtit, ptje in result.items():
if pagtit in intersectados:
final[pagtit] = final.get(pagtit, 0) + ptje

final = [(pag, tit, ptje) for (pag, tit), ptje in final.items()]
return sorted(final, key=operator.itemgetter(2), reverse=True)

def search(self, keys):
'''Returns all the values that are found for those keys.

The AND boolean operation is applied to the keys.
'''
results = []
for key in keys:
if key not in self.key_shelf:
continue
allids = self.key_shelf[key]
results.append(allids)

if not results:
return []

# do AND between results, and get the values from ids
intersectados = reduce(operator.iand, results)
allvals = self._get_info_id(intersectados)
return allvals

def partial_search(self, keys):
'''Returns all the values that are found for those partial keys.

The received keys are taken as part of the real keys (suffix,
preffix, or in the middle).

The AND boolean operation is applied to the keys.
'''
results = []
for key_search in keys:
# the partial result starts with an empty set, that will be
# filled in the OR reduce
partial_res = [set()]

# search in all the keys, for partial match
for key_stored, allids in self.key_shelf.iteritems():
if key_search in key_stored:
partial_res.append(allids)

# for the same key searched, we do OR to the results
unidos = reduce(operator.ior, partial_res)
results.append(unidos)

if not results:
return []

# do AND between results, and get the values from ids
intersectados = reduce(operator.iand, results)
allvals = self._get_info_id(intersectados)
return allvals

@classmethod
def create(cls, directory, source):
'''Creates the index in the directory.

The "source" generates pairs (key, value) to store in the index. The
key must be a string, the value can be any hashable Python object.

It must return the quantity of pairs indexed.
'''
ids_shelf = {}
key_shelf = {}
ids_cnter = 0
tmp_reverse_id = {}
indexed_counter = 0

# fill them
for key, value in source:
indexed_counter += 1

# process key
if not isinstance(key, basestring):
raise TypeError("The key must be string or unicode")

# docid -> info final
if value in tmp_reverse_id:
docid = tmp_reverse_id[value]
else:
docid = ids_cnter
tmp_reverse_id[value] = docid
ids_cnter += 1
ids_shelf[docid] = value

# keys -> docid
key_shelf.setdefault(key, set()).add(docid)

# save key
keyfilename = os.path.join(directory, "easyindex.key.bz2")
fh = CompressedFile(keyfilename, "wb")
cPickle.dump(key_shelf, fh, 2)
fh.close()

# split ids_shelf in N dicts of about ~5k entries
N = int(round(len(ids_shelf) / 5000.0))
if not N:
N = 1
all_idshelves = [{} for i in range(N)]
for k,v in ids_shelf.iteritems():
cual = utiles.coherent_hash(k) % N
all_idshelves[cual][k] = v

# save dict where corresponds
for cual, shelf in enumerate(all_idshelves):
fname = "easyindex-%03d.ids.bz2" % cual
idsfilename = os.path.join(directory, fname)
fh = CompressedFile(idsfilename, "wb")
cPickle.dump(shelf, fh, 2)
fh.close()

return indexed_counter

Change log

r343 by facundobatista on Apr 7, 2011   Diff
Fixes para no usar más hash!

Go to: 
Project members, sign in to write a code review

Older revisions

r190 by facundobatista on Jan 24, 2010   Diff
Pequeños cambios en el índice para
mejor benchmark.

r179 by facundobatista on Dec 12, 2009   Diff
Revertí el commit de la r178, quedó
esto igual a la r177.

r178 by klaussfreire on Dec 9, 2009   Diff
Commit preliminar de un nuevo índice:
 - docsets (aka postings) comprimidos
en memoria
 - un sistema de tokenización más
poderoso (con stemming)
...
All revisions of this file

File info

Size: 6889 bytes, 215 lines
Powered by Google Project Hosting