My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#! /usr/bin/env python
# KNB Corpus reader

import sys

from nltk.tree import bracket_parse, Tree

from nltk.parse import DependencyGraph

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

# default function to convert morphlist to str for tree representation
_morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS').encode('utf-8')

class KNBCorpusReader(SyntaxCorpusReader):
    """
    Corpus reader for the KNB corpus.

    This class implements:
      - L{__init__}, which specifies the location of the corpus
        and stores the function used to label parse-tree nodes.
      - L{_read_block}, which reads a block from the input stream.
      - L{_word}, which takes a block and returns a list of words.
      - L{_tag}, which takes a block and returns a list of tagged
        words.
      - L{_parse}, which takes a block and returns the parsed
        sentence.

    The structure of tagged words:
      tagged_word = (word, tags)
    where C{word} is the first space-separated cell of a morph line and
    C{tags} is a single string holding the remaining cells joined by
    single spaces.  The original author's note lists the morph fields as
    (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3,
    others ...); that layout comes from the corpus files and is not
    checked by this reader.
    """

    def __init__(self, root, fileids, encoding=None, morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        root, fileids and encoding are passed on unchanged to
        C{CorpusReader.__init__}.

        morphs2str is a function that converts a list of morphs to the
        node label used in the tree representation built by L{_parse}.
        A false value (e.g. None) disables the conversion, in which case
        nodes keep their raw morph lists.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    @staticmethod
    def _is_morph_line(line):
        """
        Tell whether C{line} carries a morph.

        Lines starting with "*", "+" or "#" are headers/comments.  The
        sentence terminator is recognised by its first cell being exactly
        "EOS", the same test the morphs2str functions apply, so a word
        that merely begins with the letters "EOS" is still a morph.
        """
        if line.startswith(("*", "#", "+")):
            return False
        return line.strip().split(" ")[0] != "EOS"

    @staticmethod
    def _split_morph(line):
        """
        Split one morph line into a (surface, tags) pair: the first
        space-separated cell, and the remaining cells re-joined with
        single spaces.
        """
        cells = line.strip().split(" ")
        return (cells[0], ' '.join(cells[1:]))

    def _read_block(self, stream):
        """Read one block from stream using C{read_blankline_block}."""
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)

    def _word(self, t):
        """
        Return the list of surface forms found in block C{t}, one per
        morph line; bunsetsu/tag/comment headers and the EOS marker are
        ignored.
        """
        return [self._split_morph(line)[0]
                for line in t.splitlines()
                if self._is_morph_line(line)]

    def _tag(self, t, simplify_tags=False):
        """
        Return the list of (surface, tags) pairs found in block C{t},
        one per morph line; bunsetsu/tag/comment headers and the EOS
        marker are ignored.

        simplify_tags is accepted but not used by this reader.
        """
        return [self._split_morph(line)
                for line in t.splitlines()
                if self._is_morph_line(line)]

    def _parse(self, t):
        """
        Build a dependency graph from block C{t} and return C{dg.tree()}.

        Every line starting with "*" or "+" opens a new node.  Its second
        cell is matched against C{([\\-0-9]*)([ADIP])}: the number is the
        index of the parent node (-1 marks the root) and the letter is
        stored as the node's 'rel'.  Every other line that does not start
        with "#" is split into a (surface, tags) morph and attached to
        the most recently opened node; this includes the "EOS" line,
        which is why morphs2str functions skip morphs whose surface is
        'EOS'.  Finally, if C{self.morphs2str} is set, each node's morph
        list is replaced by C{self.morphs2str(morphs)}.

        Raises AssertionError when a header's second cell does not match
        the expected pattern.
        """
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line.startswith(("*", "+")):
                # start of bunsetsu or tag
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])

                assert m is not None, "malformed header line: %r" % line

                dep_parent = int(m.group(1))

                # Make sure both this node and its parent exist *before*
                # touching dg.nodelist[i]; indexing first raised
                # IndexError whenever no earlier header had already
                # referred to an index >= i.
                while len(dg.nodelist) < i + 1 or len(dg.nodelist) < dep_parent + 1:
                    dg.nodelist.append({'word': [], 'deps': []})

                node = dg.nodelist[i]
                node['address'] = i
                node['rel'] = m.group(2)  # dep_type
                node['word'] = []

                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodelist[dep_parent]['deps'].append(i)

                i += 1
            elif not line.startswith("#"):
                # normal morph: attach it to the node opened last
                dg.nodelist[i - 1]['word'].append(self._split_morph(line))

        if self.morphs2str:
            for node in dg.nodelist:
                node['word'] = self.morphs2str(node['word'])

        return dg.tree()

######################################################################
# Demo
######################################################################

def demo():
    """
    Print a few samples read from the 'knbc/corpus1' corpus: the first
    ten file ids, the first hundred words joined together, the first two
    parsed sentences (once with the reader's current morphs2str and once
    with a replacement that adds the third tag field in parentheses),
    and the first two tagged sentences.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')

    # keep only the files whose name contains a digit-digit-digits-digits id
    fileids = []
    for candidate in find_corpus_fileids(FileSystemPathPointer(root), ".*"):
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", candidate):
            fileids.append(candidate)

    def _knbc_fileids_sort(x):
        # sort key: text before the first '-', then the next three
        # '-'-separated parts compared as integers
        parts = x.split('-')
        numbers = [int(parts[k]) for k in (1, 2, 3)]
        return (parts[0],) + tuple(numbers)

    ordered_fileids = sorted(fileids, key=_knbc_fileids_sort)
    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            ordered_fileids, encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    first_trees = knbc.parsed_sents()[:2]
    print('\n\n'.join('%s' % tree for tree in first_trees))

    def _surface_with_third_tag(morphs):
        # label each morph as surface(field), where field is the third
        # space-separated item of the morph's tag string
        labelled = ("%s(%s)" % (m[0], m[1].split(' ')[2])
                    for m in morphs if m[0] != 'EOS')
        return '/'.join(labelled).encode('utf-8')

    knbc.morphs2str = _surface_with_third_tag

    relabelled_trees = knbc.parsed_sents()[:2]
    print('\n\n'.join('%s' % tree for tree in relabelled_trees))

    tagged = knbc.tagged_sents()[0:2]
    print('\n'.join(
        ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
        for sent in tagged))

def test():
    """
    Smoke-test the reader against the 'knbc/corpus1' corpus: the first
    word and the first word of the first sentence must be strings, and
    the first tagged word (flat and per-sentence) must be a tuple.
    """
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
    assert isinstance(knbc.words()[0], basestring)
    assert isinstance(knbc.sents()[0][0], basestring)
    # isinstance (rather than comparing type() for equality) matches the
    # checks above and also accepts tuple subclasses
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)

# Script entry point: run the demo; the self-test call is left disabled.
if "__main__" == __name__:
    demo()
    # test()

Change log

r46 by hagisan on Nov 1, 2010   Diff
updated knbc/chasen corpus reader
Go to: 
Project members, sign in to write a code review

Older revisions

r45 by hagisan on Nov 1, 2010   Diff
updated knbc.py
r44 by hagisan on Oct 22, 2010   Diff
updated knbc reader
r43 by hagisan on Sep 26, 2010   Diff
added NLTK JPbook related codes
All revisions of this file

File info

Size: 5270 bytes, 156 lines

File properties

svn:executable
*
Powered by Google Project Hosting