My favorites
▼
|
Sign in
guestbook-example-appengine-full-text-search
Example App Engine project that uses self merge joins to do some basic full text search
Project Home
Downloads
Wiki
Issues
Source
Checkout
Browse
Changes
Source path:
svn
/
trunk
/
guestbook
/
src
/
guestbook
/
SearchJanitorUtils.java
r30
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package guestbook;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * Static helpers that turn raw text into the set of search tokens used both
 * when indexing an entry and when running a query against the index.
 */
public class SearchJanitorUtils {

    private static final Logger log = Logger.getLogger(SearchJanitorUtils.class.getName());

    /**
     * Matches anything that looks like an HTML tag. Compiled once instead of
     * on every call. Note that '.' does not match line terminators, so a tag
     * that is split across several lines is left in the text.
     */
    private static final Pattern HTML_TAG_PATTERN = Pattern.compile("\\<.*?>");

    /** From StopAnalyzer Lucene 2.9.1 */
    public final static String[] stopWords = new String[]{
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /**
     * Uses english stemming (snowball + lucene) + stopwords for getting the words.
     *
     * <p>HTML tags are replaced by a blank before the text is analyzed. Because
     * the result is a set, duplicate tokens are collapsed and the iteration
     * order is unspecified.
     *
     * @param index_raw the raw text to tokenize; {@code null} yields an empty set
     * @param maximumNumberOfTokensToReturn upper bound on the number of distinct
     *        tokens in the result; zero or a negative value yields an empty set
     * @return a mutable set with at most {@code maximumNumberOfTokensToReturn}
     *         tokens, never {@code null}. If reading the token stream fails,
     *         the error is logged and the tokens collected so far are returned.
     */
    public static Set<String> getTokensForIndexingOrQuery(
            String index_raw,
            int maximumNumberOfTokensToReturn) {

        Set<String> returnSet = new HashSet<String>();

        // Nothing to analyze or nothing requested: skip building the analyzer.
        if (index_raw == null || maximumNumberOfTokensToReturn <= 0) {
            return returnSet;
        }

        String indexCleanedOfHTMLTags =
                HTML_TAG_PATTERN.matcher(index_raw).replaceAll(" ");

        TokenStream tokenStream = null;
        try {
            Analyzer analyzer = new SnowballAnalyzer(
                    org.apache.lucene.util.Version.LUCENE_CURRENT,
                    "English",
                    stopWords);

            tokenStream = analyzer.tokenStream(
                    "content",
                    new StringReader(indexCleanedOfHTMLTags));

            Token token;
            // Check the limit first so no token is pulled once the set is full.
            while (returnSet.size() < maximumNumberOfTokensToReturn
                    && (token = tokenStream.next()) != null) {
                returnSet.add(token.term());
            }

        } catch (IOException e) {
            // Best effort: keep what was collected, but log the full stack trace.
            log.log(java.util.logging.Level.SEVERE,
                    "Could not tokenize text for indexing or query", e);
        } finally {
            if (tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (IOException e) {
                    log.log(java.util.logging.Level.WARNING,
                            "Could not close token stream", e);
                }
            }
        }

        return returnSet;
    }
}
Show details
Hide details
Change log
r3
by raphael.andre.bauer on Apr 7, 2010
Diff
initial checkin
Go to:
/trunk/guestbook/.classpath
/trunk/guestbook/.project
/trunk/guestbook/.settings
...gle.appengine.eclipse.core.prefs
...om.google.gdt.eclipse.core.prefs
/trunk/guestbook/COPYING
/trunk/guestbook/build.xml
/trunk/guestbook/src
/trunk/guestbook/src/META-INF
...tbook/src/META-INF/jdoconfig.xml
/trunk/guestbook/src/guestbook
...rc/guestbook/GuestBookEntry.java
...guestbook/src/guestbook/PMF.java
...src/guestbook/SearchJanitor.java
...uestbook/SearchJanitorUtils.java
...k/guestbook/src/log4j.properties
/trunk/guestbook/war
/trunk/guestbook/war/WEB-INF
.../war/WEB-INF/appengine-generated
...rated/datastore-indexes-auto.xml
...appengine-generated/local_db.bin
...ok/war/WEB-INF/appengine-web.xml
/trunk/guestbook/war/WEB-INF/lib
...EB-INF/lib/lucene-core-2.9.1.jar
...NF/lib/lucene-snowball-2.9.1.jar
...k/war/WEB-INF/logging.properties
...nk/guestbook/war/WEB-INF/web.xml
/trunk/guestbook/war/guestbook.jsp
/trunk/guestbook/war/search.jsp
Project members,
sign in
to write a code review
Older revisions
All revisions of this file
File info
Size: 1818 bytes, 77 lines
View raw file
File properties
svn:mime-type
text/plain
Powered by
Google Project Hosting