My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Steven Bird <sb@csse.unimelb.edu.au>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import gutenberg, genesis, inaugural,\
nps_chat, webtext, treebank, wordnet
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

# Banner shown as soon as the module is imported, before the corpora load.
# Every print below passes exactly one pre-formatted string inside
# parentheses: that form emits the same bytes under the Python 2 print
# statement and the Python 3 print function, so the module is not tied to
# Python 2-only syntax.
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

# Each text is wrapped in a Text object and its name is reported right away,
# so the user sees progress while the corpora are being read.
# text1 and text2 pass no explicit name; Text supplies one itself.
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1: %s" % text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2: %s" % text2.name)

# Every Genesis token is passed through str() before building the Text
# (the revision log gives the reason as ASCII consistency with the book).
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print("text3: %s" % text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4: %s" % text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5: %s" % text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6: %s" % text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7: %s" % text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8: %s" % text8.name)

# Like text1 and text2, text9 relies on the name Text derives on its own.
text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9: %s" % text9.name)

def texts():
    """Print the name of every loaded text, one ``textN: <name>`` line each.

    Lists text1 through text9 in order, using the same format as the
    lines printed while the module loads. Returns None.
    """
    loaded = (text1, text2, text3, text4, text5, text6, text7, text8, text9)
    for index, text in enumerate(loaded):
        # A single pre-formatted string in parentheses prints identically
        # under Python 2 (print statement) and Python 3 (print function).
        print("text%d: %s" % (index + 1, text.name))

# Sample sentences, each stored as a list of already-tokenised strings
# (punctuation marks are separate tokens). sents() prints them joined by
# single spaces. Presumably sentN is drawn from textN -- confirm against
# the corpora.
sent1 = ["Call", "me", "Ishmael", "."]
sent2 = [
    "The", "family", "of", "Dashwood", "had", "long", "been", "settled",
    "in", "Sussex", ".",
]
sent3 = [
    "In", "the", "beginning", "God", "created", "the", "heaven", "and",
    "the", "earth", ".",
]
sent4 = [
    "Fellow", "-", "Citizens", "of", "the", "Senate", "and", "of", "the",
    "House", "of", "Representatives", ":",
]
sent5 = [
    "I", "have", "a", "problem", "with", "people", "PMing", "me", "to",
    "lol", "JOIN",
]
sent6 = [
    "SCENE", "1", ":", "[", "wind", "]", "[", "clop", "clop", "clop", "]",
    "KING", "ARTHUR", ":", "Whoa", "there", "!",
]
sent7 = [
    "Pierre", "Vinken", ",", "61", "years", "old", ",", "will", "join",
    "the", "board", "as", "a", "nonexecutive", "director", "Nov.", "29", ".",
]
sent8 = [
    "25", "SEXY", "MALE", ",", "seeks", "attrac", "older", "single", "lady",
    ",", "for", "discreet", "encounters", ".",
]
sent9 = [
    "THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the", "sunset",
    "side", "of", "London", ",", "as", "red", "and", "ragged", "as", "a",
    "cloud", "of", "sunset", ".",
]

def sents():
    """Print every sample sentence, one ``sentN: <tokens>`` line each.

    Lists sent1 through sent9 in order; the tokens of each sentence are
    joined with single spaces. Returns None.
    """
    samples = (sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9)
    for index, tokens in enumerate(samples):
        # A single pre-formatted string in parentheses prints identically
        # under Python 2 (print statement) and Python 3 (print function).
        print("sent%d: %s" % (index + 1, " ".join(tokens)))

Change log

r8730 by StevenBird1 on Mar 7, 2011   Diff
Updated NLTK copyright year range from
2001-2010 to 2001-2011
Go to: 
Sign in to write a code review

Older revisions

r8694 by StevenBird1 on Oct 12, 2010   Diff
Coerce text3 to ascii, for consistency
with published book.
r8484 by StevenBird1 on Feb 2, 2010   Diff
Fixed copyright statements
r8483 by StevenBird1 on Feb 1, 2010   Diff
Added missing copyright and license
statements.
Removed PyYAML
All revisions of this file

File info

Size: 3289 bytes, 88 lines
Powered by Google Project Hosting