Skip to main content
summaryrefslogtreecommitdiffstats
blob: b4f21e056ca4d3b61b88bb3000b787836a023c9b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/*******************************************************************************
 * Copyright (c) 2000, 2016 IBM Corporation and others.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Alexander Kurtakov - Bug 460787
 *     Sopot Cela - Bug 466829
 *******************************************************************************/
package org.eclipse.help.internal.search;

import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;

/**
 * Lucene Analyzer for English. Tokens flow through
 * LowerCaseAndDigitsTokenizer -> StopFilter -> PorterStemFilter.
 * <p>
 * The stop-word set is built once, when the class is initialized, and is
 * unmodifiable, so the token streams created by this analyzer can share it
 * without any synchronization.
 */
public final class Analyzer_en extends Analyzer {

	/**
	 * Array of English stop words. Differs from StandardAnalyzer's default stop
	 * words by not having "for", "if", and "this" that are java keywords.
	 */
	private final static String[] STOP_WORDS = {"a", //$NON-NLS-1$
			"and", //$NON-NLS-1$
			"are", //$NON-NLS-1$
			"as", //$NON-NLS-1$
			"at", //$NON-NLS-1$
			"be", //$NON-NLS-1$
			"but", //$NON-NLS-1$
			"by", //$NON-NLS-1$
			"in", //$NON-NLS-1$
			"into", //$NON-NLS-1$
			"is", //$NON-NLS-1$
			"it", //$NON-NLS-1$
			"no", //$NON-NLS-1$
			"not", //$NON-NLS-1$
			"of", //$NON-NLS-1$
			"on", //$NON-NLS-1$
			"or", //$NON-NLS-1$
			"s", //$NON-NLS-1$
			"such", //$NON-NLS-1$
			"t", //$NON-NLS-1$
			"that", //$NON-NLS-1$
			"the", //$NON-NLS-1$
			"their", //$NON-NLS-1$
			"then", //$NON-NLS-1$
			"there", //$NON-NLS-1$
			"these", //$NON-NLS-1$
			"they", //$NON-NLS-1$
			"to", //$NON-NLS-1$
			"was", //$NON-NLS-1$
			"will", //$NON-NLS-1$
			"with"}; //$NON-NLS-1$

	/*
	 * Case-sensitive, unmodifiable view of STOP_WORDS shared by every StopFilter
	 * this analyzer creates. Must stay declared after STOP_WORDS: static
	 * initializers run in textual order and the array has to be populated first.
	 */
	private static final CharArraySet STOP_WORD_SET = createStopWordSet();

	/**
	 * Constructor for Analyzer_en.
	 */
	public Analyzer_en() {
		super();
	}

	/**
	 * Builds the token stream chain for a field: the tokenizer, then the stop
	 * word filter, then the Porter stemmer.
	 * <p>
	 * Try-with-resources cannot be used here because Lucene internally reuses
	 * the returned components. See
	 * {@link org.apache.lucene.analysis.Analyzer.ReuseStrategy}
	 *
	 * @param fieldName
	 *            name of the field being analyzed; not used, every field gets
	 *            the same chain
	 * @return the tokenizer together with the end of the filter chain
	 */
	@SuppressWarnings("resource")
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		final Tokenizer source = new LowerCaseAndDigitsTokenizer();
		// The shared set is read-only, so handing the same instance to each
		// filter is safe and avoids copying the stop words on every call.
		TokenStream result = new StopFilter(source, STOP_WORD_SET);
		result = new PorterStemFilter(result);
		return new TokenStreamComponents(source, result);
	}

	/**
	 * Copies STOP_WORDS into an unmodifiable, case-sensitive CharArraySet.
	 * Called exactly once, from the static initializer.
	 *
	 * @return the stop word set used by this analyzer
	 */
	private static CharArraySet createStopWordSet() {
		Set<String> words = new HashSet<>();
		for (String word : STOP_WORDS) {
			words.add(word);
		}
		return CharArraySet.unmodifiableSet(new CharArraySet(words, false));
	}

}

Back to the top