Skip to main content
aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Wolf2019-08-17 06:53:30 -0400
committerPaul Pazderski2019-08-24 08:58:34 -0400
commite50ee43b2cf76c97c833fe400539c00f9ebe8fc7 (patch)
tree9e1c5b8cd936580c9e2e1a698f58e662c343937b
parentc58c96bcb725d38350343f31c7a8988853f19709 (diff)
downloadeclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.tar.gz
eclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.tar.xz
eclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.zip
Bug 545252 - ConfigurableLineTracker: use the Aho-Corasick algorithmI20190826-0640I20190826-0415
The ConfigurableLineTracker had quadratic performance characteristics, making line-end detection even only with the standard line terminators extremely slow already on moderately sized sources. Add an efficient MultiStringMatcher using the Aho-Corasick algorithm to efficiently find any of a fixed set of strings in a text. The algorithm is a well-described efficient general-purpose multi-string matching algorithm based on a trie. Its main search loop is very simple; the more tricky bits are in the construction of the auxiliary links in the trie. The algorithm as described in 1975 returns all matches including overlaps within a text, but it can be adapted fairly easily to support the "leftmost longest match" semantics needed by ConfigurableLineTracker. The Aho-Corasick algorithm has a complexity of O(n + m + z), where n is the text length, m the sum of the length of patterns, and z the number of matches. For line end matching, this is roughly O(n + L), where L is the number of lines. Note that it can produce a quadratic number of matches on degenerate input (patterns "x", "xx", "xxx", and so on, and a text consisting only of "x"s). Adapt ConfigurableLineTracker to use the new MultiStringMatcher. The matcher provides a find() operation to find all (possibly overlapping) occurrences of the pattern strings; this is the normal way the Aho-Corasick algorithm works. An indexOf() method provides a way to efficiently find only the leftmost longest match. Since this new matcher is generally useful and fast, provide a builder interface through which pattern strings can be added iteratively before matching. Change-Id: I3e66482cfca271248caaad79157f843bc2c819d7 Signed-off-by: Thomas Wolf <thomas.wolf@paranor.ch> Also-by: Paul Pazderski <paul-eclipse@ppazderski.de>
-rw-r--r--org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java96
-rw-r--r--org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java2
-rw-r--r--org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java452
-rw-r--r--org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java3
-rw-r--r--org.eclipse.text/META-INF/MANIFEST.MF2
-rw-r--r--org.eclipse.text/pom.xml2
-rw-r--r--org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java25
-rw-r--r--org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java430
8 files changed, 999 insertions, 13 deletions
diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java
new file mode 100644
index 000000000..fe6f59748
--- /dev/null
+++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java
@@ -0,0 +1,96 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Thomas Wolf and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+package org.eclipse.text.tests;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.eclipse.jface.text.ConfigurableLineTracker;
+import org.eclipse.jface.text.DefaultLineTracker;
+import org.eclipse.jface.text.GapTextStore;
+
+public class ConfigurableLineTrackerTest extends AbstractLineTrackerTest {
+
+ @Before
+ public void setUp() {
+ fText= new GapTextStore();
+ }
+
+ @After
+ public void tearDown() {
+ fTracker= null;
+ fText= null;
+ }
+
+ /**
+ * Expected length of the tracked delimiter. First entry is length of first
+ * delimiter and so on.
+ */
+ private int[] lineEndLengths;
+
+ @Override
+ protected int getLineOffset(int line, int[] lines) {
+ int offset= 0;
+ for (int i= 0; i < line; i++) {
+ offset += (lines[i] + lineEndLengths[i]);
+ }
+ return offset;
+ }
+
+ private void setLegalDelimiters(String... delimiters) {
+ fTracker = new ConfigurableLineTracker(delimiters);
+ }
+
+ @Test
+ public void testLongestMatch() throws Exception {
+ setLegalDelimiters(DefaultLineTracker.DELIMITERS);
+ set("xxxx\r\nxx");
+ lineEndLengths = new int[] {2};
+ checkLines(new int[] {4, 2});
+ }
+
+ @Test
+ public void testMixedDefaultDelimiter() throws Exception {
+ setLegalDelimiters(DefaultLineTracker.DELIMITERS);
+ set("1234\r\n123\r1234\n1234567\r");
+ lineEndLengths = new int[] { 2, 1, 1, 1 };
+ checkLines(new int[] { 4, 3, 4, 7, 0 });
+ set("first\r\r\nthird");
+ lineEndLengths = new int[] { 1, 2 };
+ checkLines(new int[] { 5, 0, 5 });
+ }
+
+ @Test
+ public void testSingleDelimiter() throws Exception {
+ setLegalDelimiters("\n");
+ set("xxxx\nxx");
+ lineEndLengths = new int[] { 1 };
+ checkLines(new int[] { 4, 2 });
+ }
+
+ @Test
+ public void testAlternativeDelimiters1() throws Exception {
+ // create a line tracker with "show whitespace" characters as delimiter
+ setLegalDelimiters("\u00a4", "\u00b6", "\u00a4\u00b6");
+ set("xxxx\u00a4\u00b6xx");
+ lineEndLengths = new int[] { 2 };
+ checkLines(new int[] { 4, 2 });
+ }
+
+ @Test
+ public void testAlternativeDelimiters2() throws Exception {
+ setLegalDelimiters("{NewLine}", "[NewLine]");
+ set("A line{NewLine}AnotherLine[NewLine}A third line[newline]End");
+ lineEndLengths = new int[] { 9 };
+ checkLines(new int[] { 6, 44 });
+ }
+}
diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java
index 8e2650685..be13f5814 100644
--- a/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java
+++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java
@@ -29,6 +29,8 @@ import org.eclipse.text.tests.templates.TemplatesTestSuite;
*/
@RunWith(Suite.class)
@SuiteClasses({
+ MultiStringMatcherTest.class,
+ ConfigurableLineTrackerTest.class,
LineTrackerTest4.class,
DocumentExtensionTest.class,
LineTrackerTest3.class,
diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java
new file mode 100644
index 000000000..b50f57725
--- /dev/null
+++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java
@@ -0,0 +1,452 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Thomas Wolf and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+package org.eclipse.text.tests;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import org.eclipse.jface.text.MultiStringMatcher;
+import org.eclipse.jface.text.MultiStringMatcher.Match;
+
+public class MultiStringMatcherTest {
+
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
+
+ private static Match run(String text, String... needles) {
+ return MultiStringMatcher.indexOf(text, 0, needles);
+ }
+
+ private static Match run(String text, int offset, String... needles) {
+ return MultiStringMatcher.indexOf(text, offset, needles);
+ }
+
+ private static void test(Match m, String expected, int index) {
+ assertNotNull("No match", m);
+ assertEquals("Unexpected match", expected, m.getText());
+ assertEquals("Unexpected index", index, m.getOffset());
+ }
+
+ private static void testList(List<Match> matches, String expected) {
+ Collections.sort(matches, (a, b) -> {
+ int cmp = Integer.compare(a.getOffset(), b.getOffset());
+ if (cmp != 0) {
+ return cmp;
+ }
+ return Integer.compare(a.getText().length(), b.getText().length());
+ });
+ assertEquals("Unexpected results", expected, matches.toString());
+ }
+
+ @Test
+ public void test001() throws Exception {
+ Match m = run("dcccacabcccabcc", "ab", "cab");
+ test(m, "cab", 5);
+ }
+
+ @Test
+ public void test002() throws Exception {
+ Match m = run("dcccacabcccabcc", "ab", "abc");
+ test(m, "abc", 6);
+ }
+
+ @Test
+ public void test003() throws Exception {
+ Match m = run("dcccacabcccabcc", "ab", "cxb");
+ test(m, "ab", 6);
+ }
+
+ @Test
+ public void test004() throws Exception {
+ Match m = run("dcccacabcccabcc", "abc", "cabx");
+ test(m, "abc", 6);
+ }
+
+ @Test
+ public void test005() throws Exception {
+ Match m = run("dacabddd", "ac", "cab");
+ test(m, "ac", 1);
+ }
+
+ @Test
+ public void test006() throws Exception {
+ Match m = run("dacabddd", "aca", "cab");
+ test(m, "aca", 1);
+ }
+
+ @Test
+ public void test007() throws Exception {
+ Match m = run("dacabddd", "acab", "cab");
+ test(m, "acab", 1);
+ }
+
+ @Test
+ public void test008() throws Exception {
+ Match m = run("ddddddcac", "ac", "cab");
+ test(m, "ac", 7);
+ }
+
+ @Test
+ public void test009() throws Exception {
+ Match m = run("dddddcacddd", "cacx", "ac");
+ test(m, "ac", 6);
+ }
+
+ @Test
+ public void test010() throws Exception {
+ Match m = run("ddddddcac", "ac", "cac");
+ test(m, "cac", 6);
+ }
+
+ @Test
+ public void test011() throws Exception {
+ Match m = run("a", "a", "b", "ab");
+ test(m, "a", 0);
+ }
+
+ @Test
+ public void test012() throws Exception {
+ Match m = run("b", "a", "b", "ab");
+ test(m, "b", 0);
+ }
+
+ @Test
+ public void test013() throws Exception {
+ Match m = run("ab", "a", "b", "ab");
+ test(m, "ab", 0);
+ }
+
+ @Test
+ public void test014() throws Exception {
+ Match m = run("", "a", "b", "ab");
+ assertNull("Expected no match", m);
+ }
+
+ @Test
+ public void test015() throws Exception {
+ Match m = run("dddca", "ac", "cac", "ab");
+ assertNull("Expected no match", m);
+ }
+
+ @Test
+ public void test016() throws Exception {
+ Match m = run("ab", "ab", "b");
+ test(m, "ab", 0);
+ }
+
+ @Test
+ public void test017() throws Exception {
+ Match m = run("ushers", "he", "she", "his", "hers");
+ test(m, "she", 1);
+ }
+
+ @Test
+ public void test018() throws Exception {
+ Match m = run("dddhisheddd", "he", "she", "his", "hers");
+ test(m, "his", 3);
+ }
+
+ @Test
+ public void test019() throws Exception {
+ Match m = run("sotat", "at", "art", "oars", "soar");
+ test(m, "at", 3);
+ }
+
+ @Test
+ public void test020() throws Exception {
+ Match m = run("xxx", "x", "xx", "xxx");
+ test(m, "xxx", 0);
+ }
+
+ @Test
+ public void test021() throws Exception {
+ Match m = run("xx", "x", "xx", "xxx");
+ test(m, "xx", 0);
+ }
+
+ @Test
+ public void test022() throws Exception {
+ Match m = run("x", "x", "xx", "xxx");
+ test(m, "x", 0);
+ }
+
+ @Test
+ public void test023() throws Exception {
+ Match m = run("Lorem\r\nIpsum", "\n", "\r\n", "\r");
+ test(m, "\r\n", 5);
+ }
+
+ @Test
+ public void test024() throws Exception {
+ Match m = run("dcccacabcccabcc", "ab", "abcd");
+ test(m, "ab", 6);
+ }
+
+ @Test
+ public void test025() throws Exception {
+ Match m = run("dcccacabcccabcc", "abcd", "bccc");
+ test(m, "bccc", 7);
+ }
+
+ @Test
+ public void test026() throws Exception {
+ Match m = run("xxx", 1, "x", "xx", "xxx");
+ test(m, "xx", 1);
+ }
+
+ @Test
+ public void test027() throws Exception {
+ Match m = run("xxx", 2, "x", "xx", "xxx");
+ test(m, "x", 2);
+ }
+
+ @Test
+ public void test028() throws Exception {
+ Match m = run("dddhisheddd", 7, "he", "she", "his", "hers");
+ assertNull("Expected no match", m);
+ }
+
+ @Test
+ public void test029() throws Exception {
+ Match m = run("Lorem001Ipsum", "1", "01", "0");
+ test(m, "0", 5);
+ }
+
+ @Test
+ public void test030() throws Exception {
+ Match m = run("Lorem01Ipsum", "1", "01", "0");
+ test(m, "01", 5);
+ }
+
+ @Test
+ public void test031() throws Exception {
+ Match m = run("ddcababababababcabxdd", "ca", "cabx", "ababc");
+ test(m, "ca", 2);
+ }
+
+ @Test
+ public void test032() throws Exception {
+ Match m = run("ddcababcabxdd", "ca", "cabx", "cababc", "ababc");
+ test(m, "cababc", 2);
+ }
+
+ @Test
+ public void test033() throws Exception {
+ Match m = run("ddcababcdd", "ca", "cabx", "cababc", "ababc");
+ test(m, "cababc", 2);
+ }
+
+ @Test
+ public void test034() throws Exception {
+ Match m = run("ddcababcdd", "cababx", "ababc");
+ test(m, "ababc", 3);
+ }
+
+ @Test
+ public void test035() throws Exception {
+ Match m = run("ddcababcdd", "cababx", "abab", "ababc");
+ test(m, "ababc", 3);
+ }
+
+ @Test
+ public void test036() throws Exception {
+ Match m = run("ddcabababcdd", "ab", "abab", "cabababcy", "cabababcyy");
+ test(m, "abab", 3);
+ }
+
+ @Test
+ public void test037() throws Exception {
+ Match m = run("ddcabababcdd", "ab", "abab", "cabababc", "cabababcyy");
+ test(m, "cabababc", 2);
+ }
+
+ @Test
+ public void test038() throws Exception {
+ Match m = run("ddcabababcdd", "ab", "bababcd", "cabababcy", "cabababcyy");
+ test(m, "ab", 3);
+ }
+
+ @Test
+ public void test039() throws Exception {
+ Match m = run("ddcabababcddf", "bababcd", "bababcddf", "cabababcy", "cabababcyy");
+ test(m, "bababcddf", 4);
+ }
+
+ @Test
+ public void test040() throws Exception {
+ Match m = run("ddcabababcddffxx", "ffxx", "bababcd", "bababcddffy", "cabababcy", "cabababcyy");
+ test(m, "bababcd", 4);
+ }
+
+ @Test
+ public void multi001() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers");
+ List<Match> matches = m.find("ushers", 0);
+ testList(matches, "[[she, 1], [he, 2], [hers, 2]]");
+ }
+
+ @Test
+ public void multi002() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers");
+ List<Match> matches = m.find("dddhisheddd", 0);
+ testList(matches, "[[his, 3], [she, 5], [he, 6]]");
+ }
+
+ @Test
+ public void multi003() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "sh", "is");
+ List<Match> matches = m.find("dddhisheddd", 0);
+ testList(matches, "[[his, 3], [is, 4], [sh, 5], [she, 5], [he, 6]]");
+ }
+
+ @Test
+ public void multi004() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("ab", "about", "at", "ate", "be", "bed", "edge", "get");
+ List<Match> matches = m.find("abedgetab", 0);
+ testList(matches, "[[ab, 0], [be, 1], [bed, 1], [edge, 2], [get, 4], [ab, 7]]");
+ }
+
+ @Test
+ public void multi005() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar");
+ List<Match> matches = m.find("soars", 0);
+ testList(matches, "[[soar, 0], [oars, 1]]");
+ }
+
+ @Test
+ public void multi006() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar");
+ List<Match> matches = m.find("oart", 0);
+ testList(matches, "[[art, 1]]");
+ }
+
+ @Test
+ public void multi007() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar");
+ List<Match> matches = m.find("soarsoars", 0);
+ testList(matches, "[[soar, 0], [oars, 1], [soar, 4], [oars, 5]]");
+ }
+
+ @Test
+ public void multi008() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar");
+ List<Match> matches = m.find("sotat", 0);
+ testList(matches, "[[at, 3]]");
+ }
+
+ @Test
+ public void multi009() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("i", "in", "tin", "sting");
+ List<Match> matches = m.find("sting", 0);
+ testList(matches, "[[sting, 0], [tin, 1], [i, 2], [in, 2]]");
+ }
+
+ @Test
+ public void multi010() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("i", "in", "tin", "sting", "hastings");
+ List<Match> matches = m.find("hastings", 0);
+ testList(matches, "[[hastings, 0], [sting, 2], [tin, 3], [i, 4], [in, 4]]");
+ }
+
+ @Test
+ public void multi011() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("x", "xx", "xxx");
+ List<Match> matches = m.find("xxx", 0);
+ testList(matches, "[[x, 0], [xx, 0], [xxx, 0], [x, 1], [xx, 1], [x, 2]]");
+ }
+
+ @Test
+ public void multi012() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("she", "his", "hers");
+ List<Match> matches = m.find("dddhiheddd", 0);
+ assertEquals("Expected no match", 0, matches.size());
+ }
+
+ @Test
+ public void multi013() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers");
+ List<Match> matches = m.find("dddhisheddd", 4);
+ testList(matches, "[[she, 5], [he, 6]]");
+ }
+
+ @Test
+ public void multi014() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.create("x", "xx", "xxx");
+ List<Match> matches = m.find("xxx", 2);
+ testList(matches, "[[x, 2]]");
+ }
+
+ @Test
+ public void noStrings001() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.builder().build();
+ assertNull("Expected no match", m.indexOf("dhihedd", 0));
+ List<Match> matches = m.find("dddhiheddd", 0);
+ assertEquals("Expected no match", 0, matches.size());
+ }
+
+ @Test
+ public void noStrings002() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.builder().add("").build();
+ assertNull("Expected no match", m.indexOf("dhihedd", 0));
+ List<Match> matches = m.find("dddhiheddd", 0);
+ assertEquals("Expected no match", 0, matches.size());
+ }
+
+ @Test
+ public void noStrings003() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.builder().add((String[]) null).build();
+ assertNull("Expected no match", m.indexOf("dhihedd", 0));
+ List<Match> matches = m.find("dddhiheddd", 0);
+ assertEquals("Expected no match", 0, matches.size());
+ }
+
+ @Test
+ public void fluent001() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.builder().add("he", "she", "his", "hers").build();
+ test(m.indexOf("ushers", 0), "she", 1);
+ }
+
+ @Test
+ public void fluent002() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.builder().add("he", "she").add("his", "hers").build();
+ testList(m.find("ushers", 0), "[[she, 1], [he, 2], [hers, 2]]");
+ }
+
+ @Test
+ public void fluent003() throws Exception {
+ MultiStringMatcher m = MultiStringMatcher.builder().add("he").add(null, "she", "").add("his", "hers").build();
+ testList(m.find("ushers", 0), "[[she, 1], [he, 2], [hers, 2]]");
+ }
+
+ @Test
+ public void addAfterBuild() throws Exception {
+ MultiStringMatcher.Builder b = MultiStringMatcher.builder().add("he", "she").add("his", "hers");
+ b.build();
+ thrown.expect(IllegalStateException.class);
+ b.add("us");
+ }
+
+ @Test
+ public void reuseBuilder() throws Exception {
+ MultiStringMatcher.Builder b = MultiStringMatcher.builder().add("he", "she").add("his", "hers");
+ b.build();
+ thrown.expect(IllegalStateException.class);
+ b.build();
+ }
+}
diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java
index 7504167e8..3a0cc75cc 100644
--- a/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java
+++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java
@@ -13,7 +13,8 @@
*******************************************************************************/
package org.eclipse.text.tests;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import java.util.ArrayList;
import java.util.Iterator;
diff --git a/org.eclipse.text/META-INF/MANIFEST.MF b/org.eclipse.text/META-INF/MANIFEST.MF
index 82e539f8d..4ee82ebb2 100644
--- a/org.eclipse.text/META-INF/MANIFEST.MF
+++ b/org.eclipse.text/META-INF/MANIFEST.MF
@@ -2,7 +2,7 @@ Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: %pluginName
Bundle-SymbolicName: org.eclipse.text
-Bundle-Version: 3.8.300.qualifier
+Bundle-Version: 3.9.0.qualifier
Bundle-Vendor: %providerName
Bundle-Localization: plugin
Export-Package:
diff --git a/org.eclipse.text/pom.xml b/org.eclipse.text/pom.xml
index 1855055d5..41e06eee8 100644
--- a/org.eclipse.text/pom.xml
+++ b/org.eclipse.text/pom.xml
@@ -18,6 +18,6 @@
</parent>
<groupId>org.eclipse.text</groupId>
<artifactId>org.eclipse.text</artifactId>
- <version>3.8.300-SNAPSHOT</version>
+ <version>3.9.0-SNAPSHOT</version>
<packaging>eclipse-plugin</packaging>
</project>
diff --git a/org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java b/org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java
index 75b8a932d..a9373d014 100644
--- a/org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java
+++ b/org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2000, 2008 IBM Corporation and others.
+ * Copyright (c) 2000, 2019 IBM Corporation and others.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
@@ -10,11 +10,14 @@
*
* Contributors:
* IBM Corporation - initial API and implementation
+ * Thomas Wolf - Bug 545252: improved search performance for multiple delimiters
*******************************************************************************/
package org.eclipse.jface.text;
import org.eclipse.core.runtime.Assert;
+import org.eclipse.jface.text.MultiStringMatcher.Match;
+
/**
* Standard implementation of a generic
@@ -31,12 +34,12 @@ import org.eclipse.core.runtime.Assert;
*/
public class ConfigurableLineTracker extends AbstractLineTracker {
-
/** The strings which are considered being the line delimiter */
- private String[] fDelimiters;
+ private final String[] fDelimiters;
/** A predefined delimiter information which is always reused as return value */
- private DelimiterInfo fDelimiterInfo= new DelimiterInfo();
-
+ private final DelimiterInfo fDelimiterInfo= new DelimiterInfo();
+ /** Util to search the configured line delimiters in text. <code>null</code> if only one delimiter is used. */
+ private final MultiStringMatcher fMatcher;
/**
* Creates a standard line tracker for the given line delimiters.
@@ -47,6 +50,7 @@ public class ConfigurableLineTracker extends AbstractLineTracker {
public ConfigurableLineTracker(String[] legalLineDelimiters) {
Assert.isTrue(legalLineDelimiters != null && legalLineDelimiters.length > 0);
fDelimiters= TextUtilities.copy(legalLineDelimiters);
+ fMatcher= legalLineDelimiters.length > 1 ? MultiStringMatcher.create(legalLineDelimiters) : null;
}
@Override
@@ -56,12 +60,13 @@ public class ConfigurableLineTracker extends AbstractLineTracker {
@Override
protected DelimiterInfo nextDelimiterInfo(String text, int offset) {
- if (fDelimiters.length > 1) {
- int[] info= TextUtilities.indexOf(fDelimiters, text, offset);
- if (info[0] == -1)
+ if (fMatcher != null) {
+ Match m = fMatcher.indexOf(text, offset);
+ if (m == null) {
return null;
- fDelimiterInfo.delimiterIndex= info[0];
- fDelimiterInfo.delimiter= fDelimiters[info[1]];
+ }
+ fDelimiterInfo.delimiterIndex= m.getOffset();
+ fDelimiterInfo.delimiter= m.getText();
} else {
int index= text.indexOf(fDelimiters[0], offset);
if (index == -1)
diff --git a/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java b/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java
new file mode 100644
index 000000000..daef7f70a
--- /dev/null
+++ b/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java
@@ -0,0 +1,430 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Paul Pazderski, Thomas Wolf, and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Paul Pazderski; Thomas Wolf - initial API and implementation
+ *******************************************************************************/
+package org.eclipse.jface.text;
+
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.stream.Collectors;
+
+/**
+ * Fast matcher to find the occurrences of any of a fixed set of constant strings. Supports finding
+ * all (possibly overlapping) matches, or only the leftmost longest match.
+ *
+ * @since 3.9
+ */
+public class MultiStringMatcher {
+
+ // An implementation of the Aho-Corasick algorithm (without the DFA construction from section 6 of the
+ // paper; just the failure and output links).
+ //
+ // See Aho, Alfred A.; Corasick, Margaret J.: "Efficient String Matching: An Aid to Bibliographic Search",
+ // CACM 18(6), 1975.
+ //
+ // The algorithm has been modified to support both reporting all matches or only leftmost longest matches.
+
+ /**
+ * Describes a match result of {@link MultiStringMatcher#indexOf(CharSequence, int)}, giving
+ * access to the matched string and the offset in the text it was matched at.
+ */
+ public static interface Match {
+
+ /**
+ * Obtains the matched string.
+ *
+ * @return the text matched
+ */
+ String getText();
+
+ /**
+ * Obtains the offset the {@link #getText() text} was matched at.
+ *
+ * @return the offset
+ */
+ int getOffset();
+
+ }
+
+ /** A Builder for creating a {@link MultiStringMatcher}. */
+ public static interface Builder {
+
+ /**
+ * Adds search strings to be looked for. {@code null} and empty strings in the arguments are
+ * ignored.
+ *
+ * @param searchStrings to add to be looked for by the matcher.
+ * @return this
+ * @throws IllegalStateException if the {@link MultiStringMatcher} was already built.
+ */
+ Builder add(String... searchStrings);
+
+ /**
+ * Returns the {@link MultiStringMatcher} built by this builder.
+ * <p>
+ * Note that a {@link Builder} instance can build only <em>one</em>
+ * {@link MultiStringMatcher} instance. This is by design; otherwise the builder would have
+ * to store all the searchStrings somewhere, which may be rather memory intensive if a lot
+ * of search strings are added.
+ * </p>
+ *
+ * @return the {@link MultiStringMatcher}
+ * @throws IllegalStateException if the {@link MultiStringMatcher} was already built.
+ */
+ MultiStringMatcher build();
+ }
+
+ private static class BuilderImpl implements Builder {
+
+ private MultiStringMatcher m;
+
+ BuilderImpl() {
+ m= new MultiStringMatcher();
+ }
+
+ private void check() {
+ if (m == null) {
+ throw new IllegalStateException("Builder.build() was already called"); //$NON-NLS-1$
+ }
+ }
+
+ @Override
+ public Builder add(String... searchStrings) {
+ check();
+ m.add(searchStrings);
+ return this;
+ }
+
+ @Override
+ public MultiStringMatcher build() {
+ check();
+ MultiStringMatcher result= m;
+ m= null;
+ if (!result.root.hasChildren()) {
+ // no search strings were added; return a specialized "matches nothing" matcher
+ return new MultiStringMatcher() {
+ @Override
+ public List<Match> find(CharSequence text, int offset) {
+ return new LinkedList<>();
+ }
+
+ @Override
+ public Match indexOf(CharSequence text, int offset) {
+ return null;
+ }
+ };
+ }
+ result.buildLinks();
+ return result;
+ }
+ }
+
+ /**
+ * Creates an initially empty {@link Builder}.
+ *
+ * @return the {@link Builder}
+ */
+ public static Builder builder() {
+ return new BuilderImpl();
+ }
+
+ private static class MatchResult implements Match {
+
+ private final String match;
+
+ private final int offset;
+
+ public MatchResult(String match, int offset) {
+ this.match= match;
+ this.offset= offset;
+ }
+
+ @Override
+ public String getText() {
+ return match;
+ }
+
+ @Override
+ public int getOffset() {
+ return offset;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hashCode(match) * 31 + Integer.hashCode(offset);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null || getClass() != obj.getClass()) {
+ return false;
+ }
+ MatchResult other= (MatchResult) obj;
+ return offset == other.offset && Objects.equals(match, other.match);
+ }
+
+ @Override
+ public String toString() {
+ return '[' + match + ", " + offset + ']'; //$NON-NLS-1$
+ }
+ }
+
+ /** A node in the trie built from the search strings. */
+ private static class Node {
+ HashMap<Character, Node> children;
+
+ String match;
+
+ Node fail;
+
+ Node output;
+
+ Node next(Character c) {
+ return children == null ? null : children.get(c);
+ }
+
+ Node add(char c) {
+ if (children == null) {
+ children= new HashMap<>();
+ }
+ return children.computeIfAbsent(Character.valueOf(c), key -> new Node());
+ }
+
+ boolean hasChildren() {
+ return children != null;
+ }
+
+ @Override
+ public String toString() {
+ return "Match: " + (match == null ? "null" : '>' + match + '<') //$NON-NLS-1$ //$NON-NLS-2$
+ + " Children: " + (children == null ? "<none>" : children.keySet().stream().map(c -> c.toString()).collect(Collectors.joining(", "))); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+ }
+ }
+
+ /** Root node of the trie. */
+ private final Node root= new Node() {
+ @Override
+ Node next(Character c) {
+ // Implements the sentinel loop on the root node for all non-matching characters.
+ Node child= super.next(c);
+ return child == null ? this : child;
+ }
+ };
+
+ private MultiStringMatcher() {
+ // Always use a Builder or the static helper methods to create a MultiStringMatcher
+ }
+
+ private void add(String... searchStrings) {
+ if (searchStrings != null) {
+ for (String searchString : searchStrings) {
+ if (searchString == null || searchString.isEmpty()) {
+ continue;
+ }
+ Node node= root;
+ for (char c : searchString.toCharArray()) {
+ node= node.add(c);
+ }
+ node.match= searchString;
+ }
+ }
+ }
+
+ private void buildLinks() {
+ // Build the fail and output links. See the paper referenced at the top; this
+ // is a one-to-one implementation of the original algorithm. Variable names
+ // s, r, and state are kept as in the paper.
+ List<Node> queue= new LinkedList<>();
+ for (Node s : root.children.values()) {
+ if (s.children != null) {
+ // No need to queue nodes without children since we don't do anything
+ // with them anyway.
+ queue.add(s);
+ }
+ s.fail= root;
+ }
+ while (!queue.isEmpty()) {
+ Node r= queue.remove(0);
+ for (Map.Entry<Character, Node> entry : r.children.entrySet()) {
+ Character c= entry.getKey();
+ Node s= entry.getValue();
+ if (s.children != null) {
+ queue.add(s);
+ }
+ Node state= r.fail;
+ Node f;
+ while ((f= state.next(c)) == null) {
+ state= state.fail;
+ }
+ s.fail= f;
+ if (f.match != null) {
+ s.output= f;
+ } else if (f.output != null) {
+ s.output= f.output;
+ }
+ }
+ }
+ }
+
+ /**
+ * Find the next occurrence of any of the search strings of the {@link MultiStringMatcher} in
+ * the given {@code text} starting at the given {@code offset}.
+ * <p>
+ * Performs a simultaneous search for all the strings, returning the leftmost match. If multiple
+ * search strings match at the same index, the longest match is returned.
+ * </p>
+ *
+ * @param text to search (not {@code null})
+ * @param offset to start searching at
+ * @return the leftmost longest match found, or {@code null} if no match was found.
+ */
+ public Match indexOf(CharSequence text, int offset) {
+ // Main search loop of the Aho-Corasick algorithm, modified to stop after
+ // the leftmost longest match.
+ // How it works: primary goal -> lowest offset, secondary goal -> longest match.
+ // We differ between primary and sub matches. When the first match a character we walk down
+ // a path on the trie to find a match. Any match we found on this way is a primary match and
+ // any new primary match is better than the one before. Also a primary match always got a better
+ // offset than any sub or failover match.
+ // A sub match is a sub pattern of our main trie path. There offset is always after any of the
+ // primaries. Sub matches are not found in offset order so we must check if a new sub match is
+ // better than our best so far.
+ // If we fell off the trie path but found a primary match we are done. No other match can get a lower
+ // offset. Sometimes we can continue on another trie path through the fail link. If we found no
+ // failover path we can return our sub match or must restart at root. If we changed to a
+ // failover path we must check on the first direct match if a sub match from the previous path
+ // is better. If yes it will be returned if not we continue like we did on the first path.
+ int textEnd= text.length();
+ Match primaryMatch= null;
+ Match subMatch= null;
+ boolean failover= false;
+ Node node= root;
+ for (int i= offset; i < textEnd; i++) {
+ Character c= Character.valueOf(text.charAt(i));
+ Node next= node.next(c);
+ if (next == null) {
+ // Fell off the trie.
+ if (primaryMatch != null) {
+ // Return primary match because any other match must have a higher offset.
+ return primaryMatch;
+ }
+ // Search for other trie to change to.
+ do {
+ node= node.fail;
+ } while ((next= node.next(c)) == null);
+ failover= (node != root);
+ if (!failover && subMatch != null) {
+ // We fell of the trie and could not switch to another. Return best sub match if possible.
+ return subMatch;
+ }
+ }
+ node= next;
+ if (node.match != null) {
+ int newOffset= i - node.match.length() + 1;
+ // On a failover trie the sub match can be better.
+ // And if it is return it because nothing better will follow.
+ if (failover && subMatch != null && subMatch.getOffset() < newOffset) {
+ return subMatch;
+ }
+ // Any new primary match is better because all have the same offset but any new must be longer.
+ primaryMatch= new MatchResult(node.match, newOffset);
+ if (!node.hasChildren()) {
+ // We will fall off the trie on next character so we can return right here
+ return primaryMatch;
+ }
+ }
+ // Check for sub matches but only if there is no primary match because only another primary match can be better.
+ if (primaryMatch == null) {
+ Node out= node.output;
+ if (out != null) {
+ int newOffset= i - out.match.length() + 1;
+ if (subMatch == null
+ || newOffset < subMatch.getOffset()
+ || (newOffset == subMatch.getOffset() && out.match.length() > subMatch.getText().length())) {
+ subMatch= new MatchResult(out.match, newOffset);
+ }
+ }
+ }
+ }
+ return primaryMatch != null ? primaryMatch : subMatch;
+ }
+
+ /**
+ * Finds all occurrences of any of the search strings of the {@link MultiStringMatcher} in the
+ * given {@code text} starting at the given {@code offset}, including overlapping occurrences.
+ *
+ * @param text to search (not {@code null})
+ * @param offset to start searching at
+ * @return a possibly empty list of matches
+ */
+ public List<Match> find(CharSequence text, int offset) {
+ // Main search loop of the standard Aho-Corasick algorithm.
+ int textEnd= text.length();
+ List<Match> matches= new LinkedList<>();
+ Node node= root;
+ for (int i= offset; i < textEnd; i++) {
+ Character c= Character.valueOf(text.charAt(i));
+ Node next;
+ while ((next= node.next(c)) == null) {
+ node= node.fail;
+ }
+ node= next;
+ if (node.match != null) {
+ matches.add(new MatchResult(node.match, i - node.match.length() + 1));
+ }
+ Node out= node.output;
+ while (out != null) {
+ matches.add(new MatchResult(out.match, i - out.match.length() + 1));
+ out= out.output;
+ }
+ }
+ return matches;
+ }
+
+ /**
+ * Finds the leftmost longest occurrence of any of the given {@code searchStrings} in the
+ * {@code text} starting at the given {@code offset}.
+ * <p>
+ * To match the same set of search strings repeatedly against texts it is more efficient to
+ * build and re-use a {@link MultiStringMatcher}.
+ * </p>
+ *
+ * @param text to search (not {@code null})
+ * @param offset to start searching at
+ * @param searchStrings to look for; non-{@code null} and non-empty strings are ignored
+ * @return a {@link Match} describing the match found, or {@code null} if no match was found or
+ * there are no non-{@code null} non-empty {@code searchStrings}
+ */
+ public static Match indexOf(CharSequence text, int offset, String... searchStrings) {
+ return create(searchStrings).indexOf(text, offset);
+ }
+
+ /**
+ * Creates a {@link MultiStringMatcher} for the given {@code searchStrings}.
+ * <p>
+ * If there are no non-{@code null} non-empty {@code searchStrings}, the returned
+ * {@link MultiStringMatcher} will never match anything.
+ * </p>
+ *
+ * @param searchStrings to look for; non-{@code null} and non-empty strings are ignored
+ * @return the {@link MultiStringMatcher}
+ */
+ public static MultiStringMatcher create(String... searchStrings) {
+ return builder().add(searchStrings).build();
+ }
+}

Back to the top