# If we already started into a book, then this verse encountered
# means that we need to clear the buffer and start a new one
else:
# If the previous verse was empty, then eliminate it
if not rawVerses[i][-1]['data']:
rawVerses[i].pop()
rawVerses[i].append({
'meta':verseMarkerMatch.groupdict(),
'data':''
})
continue
# (Disabled) Append a space to the previous verse if this verse doesn't begin with <PB>, <SB>, or whitespace
#if line and len(rawVerses[i]) > 1 and rawVerses[i][-1]['data'] and not re.match(r'^(<(PB|SB)>|\s)', line):
# print line
# rawVerses[i][-1]['data'] += ' '
#if len(rawVerses[i]) > 1 and not rawVerses[i][-2]['data'].endswith(" "):
# rawVerses[i][-2]['data'] += " "
# Append data to the opened verse buffer
if line and rawVerses[i][-1]['data'] and not rawVerses[i][-1]['data'].startswith('[D ') and not re.match(ur'^(<.+?>)+$', rawVerses[i][-1]['data']) and not re.match(ur'^(<.+?>)+$', line):
rawVerses[i][-1]['data'] += "\n"
rawVerses[i][-1]['data'] += line
if files[0].closed and files[1].closed:
break
# When all is said and done, the two verse counts should be the same;
# report both counts on failure so that a mismatch can be diagnosed.
assert len(rawVerses[0]) == len(rawVerses[1]), \
    "Verse counts differ between the two sources: %d != %d" % (len(rawVerses[0]), len(rawVerses[1]))
# Add additional whitespace to the ends of verses if the following verse
# does not begin with <SB>, <PB>, or whitespace (formerly: if the following verse is in the same book)
for rvv in rawVerses:
for i in range(0, len(rvv)-1):
if i+1 >= len(rvv):
break
if rvv[i]['data'].startswith('[D ') and not re.match(ur'^(\s|<PB>|<SB>)', rvv[i+1]['data']):
print "Position is same (%s) but data is different \"%s\" != \"%s\"!" % (finalTokenPosition, import_helpers.normalize_token(placeholderToken.data), import_helpers.normalize_token(previousToken.data))
raise Exception("Position is same (%s) but data is different \"%s\" != \"%s\"!" % (finalTokenPosition, import_helpers.normalize_token(placeholderToken.data), import_helpers.normalize_token(previousToken.data)))
# Now take the placeholderToken and convert it into a real token and insert it into the database
#placeholderToken.certainty = None #Handled by variantGroupToken
placeholderToken.position = finalTokenPosition
token = placeholderToken.establish()
tokens.append(token)
updateRefs(i)
def updateRefs(*ii):
    """Bring the newest ref of every kind up to date with the latest token.

    For each index given in ``ii``, the final entry of ``bookRefs``,
    ``chapterRefs``, ``verseRefs`` and ``pageRefs`` at that index is visited;
    empty lists are skipped.  A ref whose ``id`` is falsy receives
    ``tokens[-1]`` as its ``start_token`` and is saved immediately.  Every
    visited ref then gets ``tokens[-1]`` as its ``end_token``.  A chapter or
    verse ref that has no parent yet is attached to the newest book or
    chapter ref respectively.

    Note that the ``end_token`` and ``parent`` assignments are not saved by
    this function.
    """
    global bookRefs, chapterRefs, verseRefs, pageRefs, tokens
    for sourceIndex in ii:
        groups = (
            bookRefs[sourceIndex],
            chapterRefs[sourceIndex],
            verseRefs[sourceIndex],
            pageRefs[sourceIndex],
        )
        for group in groups:
            # Nothing of this kind has been opened yet for this source
            if not group:
                continue
            newestRef = group[-1]

            # A ref without an id has never been saved, so this token is
            # the first one it covers
            if not newestRef.id:
                newestRef.start_token = tokens[-1]
                newestRef.save()

            # The most recently parsed token is always the current end
            newestRef.end_token = tokens[-1]

            # Only chapters and verses that lack a parent need one assigned
            if newestRef.parent:
                continue
            if newestRef.type == Ref.CHAPTER:
                newestRef.parent = bookRefs[sourceIndex][-1]
            if newestRef.type == Ref.VERSE:
                newestRef.parent = chapterRefs[sourceIndex][-1]
# Collect the tokens of TNT1 and TNT2 into one list per source, filling the
# first source completely before the second one is started
placeholderTokens = [[], []]
for sourceIndex, sourceTokens in enumerate(placeholderTokens):
    for token in verseTokens(sourceIndex):
        sourceTokens.append(token)
# Now merge placeholderTokens[0] and placeholderTokens[1]; remember, the differences are going to
# be in punctuation and in accentuation; we need to ensure that placeholderTokens with
# differing accentuation are stored in the same position
# Now merge placeholderTokens[0] with placeholderTokens[1], and the result will be yielded back
# 'replace' a[i1:i2] should be replaced by b[j1:j2].
# 'delete' a[i1:i2] should be deleted. Note that j1 == j2 in this case.
# 'insert' b[j1:j2] should be inserted at a[i1:i1]. Note that i1 == i2 in this case.
# 'equal' a[i1:i2] == b[j1:j2] (the sub-sequences are equal).
# However, we need to not set variant_group to None if the certainty is not the same!!!