My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package guestbook;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;

/**
 * Static helpers that turn raw (possibly HTML-containing) text into a set of
 * stemmed search tokens, for use both when indexing and when querying.
 */
public class SearchJanitorUtils {

    private static final Logger log = Logger.getLogger(SearchJanitorUtils.class.getName());

    /** Matches anything that looks like an HTML tag; compiled once instead of on every call. */
    private static final Pattern HTML_TAG_PATTERN = Pattern.compile("\\<.*?>");

    /**
     * From StopAnalyzer Lucene 2.9.1.
     *
     * The array is public and therefore mutable by callers; treat it as read-only.
     */
    public final static String[] stopWords = new String[]{
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /**
     * Uses english stemming (snowball + lucene) + stopwords for getting the words.
     *
     * HTML tags are replaced by a blank before the text is analyzed. Tokenizing is
     * best-effort: if the token stream fails with an {@link IOException} the error
     * is logged and the tokens collected up to that point are returned.
     *
     * @param index_raw the raw text to tokenize; {@code null} yields an empty set
     * @param maximumNumberOfTokensToReturn upper bound on the number of distinct
     *        tokens collected; a value of zero or less yields an empty set
     * @return a new, modifiable set holding at most
     *         {@code maximumNumberOfTokensToReturn} distinct tokens; never {@code null}
     */
    public static Set<String> getTokensForIndexingOrQuery(
            String index_raw,
            int maximumNumberOfTokensToReturn) {

        Set<String> returnSet = new HashSet<String>();

        // Nothing to analyze (or nothing may be returned): skip building the analyzer.
        if (index_raw == null || maximumNumberOfTokensToReturn <= 0) {
            return returnSet;
        }

        String indexCleanedOfHTMLTags = HTML_TAG_PATTERN.matcher(index_raw).replaceAll(" ");

        try {

            Analyzer analyzer = new SnowballAnalyzer(
                    org.apache.lucene.util.Version.LUCENE_CURRENT,
                    "English",
                    stopWords);

            TokenStream tokenStream = analyzer.tokenStream(
                    "content",
                    new StringReader(indexCleanedOfHTMLTags));

            try {
                Token token;

                // Check the size limit first so no token is read once the set is full.
                while ((returnSet.size() < maximumNumberOfTokensToReturn)
                        && ((token = tokenStream.next()) != null)) {
                    returnSet.add(token.term());
                }
            } finally {
                // Release the stream (and the reader behind it) even when tokenizing fails.
                tokenStream.close();
            }

        } catch (IOException e) {
            // Pass the exception itself so the stack trace is kept; getMessage() may be null.
            log.log(java.util.logging.Level.SEVERE,
                    "Tokenizing failed; returning the tokens collected so far", e);
        }

        return returnSet;
    }

}

Change log

r3 by raphael.andre.bauer on Apr 7, 2010   Diff
initial checkin
Go to: 
Project members, sign in to write a code review

Older revisions

All revisions of this file

File info

Size: 1818 bytes, 77 lines

File properties

svn:mime-type
text/plain
Powered by Google Project Hosting