diff options
author | Thomas Wolf | 2019-08-17 10:53:30 +0000 |
---|---|---|
committer | Paul Pazderski | 2019-08-24 12:58:34 +0000 |
commit | e50ee43b2cf76c97c833fe400539c00f9ebe8fc7 (patch) | |
tree | 9e1c5b8cd936580c9e2e1a698f58e662c343937b /org.eclipse.text.tests | |
parent | c58c96bcb725d38350343f31c7a8988853f19709 (diff) | |
download | eclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.tar.gz eclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.tar.xz eclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.zip |
Bug 545252 - ConfigurableLineTracker: use the Aho-Corasick algorithmI20190826-0640I20190826-0415
The ConfigurableLineTracker had quadratic performance characteristics,
making line-end detection even only with the standard line terminators
extremely slow already on moderately sized sources.
Add an efficient MultiStringMatcher using the Aho-Corasick algorithm
to efficiently find any of a fixed set of strings in a text. The
algorithm is a well-described efficient general-purpose multi-string
matching algorithm based on a trie. Its main search loop is very
simple; the more tricky bits are in the construction of the auxiliary
links in the trie. The algorithm as described in 1975 returns all
matches including overlaps within a text, but it can be adapted fairly
easily to support the "leftmost longest match" semantics needed by
ConfigurableLineTracker.
The Aho-Corasick algorithm has a complexity of O(n + m + z),
where n is the text length, m the sum of the length of patterns,
and z the number of matches. For line end matching, this is roughly
O(n + L), where L is the number of lines.
Note that it can produce a quadratic number of matches on degenerate
input (patterns "x", "xx", "xxx", and so on, and a text consisting
only of "x"s).
Adapt ConfigurableLineTracker to use the new MultiStringMatcher. The
matcher provides a find() operation to find all (possibly overlapping)
occurrences of the pattern strings; this is the normal way the
Aho-Corasick algorithm works. An indexOf() method provides a way to
efficiently find only the leftmost longest match.
Since this new matcher is generally useful and fast, provide a builder
interface through which pattern strings can be added iteratively
before matching.
Change-Id: I3e66482cfca271248caaad79157f843bc2c819d7
Signed-off-by: Thomas Wolf <thomas.wolf@paranor.ch>
Also-by: Paul Pazderski <paul-eclipse@ppazderski.de>
Diffstat (limited to 'org.eclipse.text.tests')
4 files changed, 552 insertions, 1 deletions
diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java new file mode 100644 index 00000000000..fe6f597485f --- /dev/null +++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java @@ -0,0 +1,96 @@ +/******************************************************************************* + * Copyright (c) 2019 Thomas Wolf and others. + * + * This program and the accompanying materials + * are made available under the terms of the Eclipse Public License 2.0 + * which accompanies this distribution, and is available at + * https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + *******************************************************************************/ +package org.eclipse.text.tests; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.eclipse.jface.text.ConfigurableLineTracker; +import org.eclipse.jface.text.DefaultLineTracker; +import org.eclipse.jface.text.GapTextStore; + +public class ConfigurableLineTrackerTest extends AbstractLineTrackerTest { + + @Before + public void setUp() { + fText= new GapTextStore(); + } + + @After + public void tearDown() { + fTracker= null; + fText= null; + } + + /** + * Expected length of the tracked delimiter. First entry is length of first + * delimiter and so on. + */ + private int[] lineEndLengths; + + @Override + protected int getLineOffset(int line, int[] lines) { + int offset= 0; + for (int i= 0; i < line; i++) { + offset += (lines[i] + lineEndLengths[i]); + } + return offset; + } + + private void setLegalDelimiters(String... delimiters) { + fTracker = new ConfigurableLineTracker(delimiters); + } + + @Test + public void testLongestMatch() throws Exception { + setLegalDelimiters(DefaultLineTracker.DELIMITERS); + set("xxxx\r\nxx"); + lineEndLengths = new int[] {2}; + checkLines(new int[] {4, 2}); + } + + @Test + public void testMixedDefaultDelimiter() throws Exception { + setLegalDelimiters(DefaultLineTracker.DELIMITERS); + set("1234\r\n123\r1234\n1234567\r"); + lineEndLengths = new int[] { 2, 1, 1, 1 }; + checkLines(new int[] { 4, 3, 4, 7, 0 }); + set("first\r\r\nthird"); + lineEndLengths = new int[] { 1, 2 }; + checkLines(new int[] { 5, 0, 5 }); + } + + @Test + public void testSingleDelimiter() throws Exception { + setLegalDelimiters("\n"); + set("xxxx\nxx"); + lineEndLengths = new int[] { 1 }; + checkLines(new int[] { 4, 2 }); + } + + @Test + public void testAlternativeDelimiters1() throws Exception { + // create a line tracker with "show whitespace" characters as delimiter + setLegalDelimiters("\u00a4", "\u00b6", "\u00a4\u00b6"); + set("xxxx\u00a4\u00b6xx"); + lineEndLengths = new int[] { 2 }; + checkLines(new int[] { 4, 2 }); + } + + @Test + public void testAlternativeDelimiters2() throws Exception { + setLegalDelimiters("{NewLine}", "[NewLine]"); + set("A line{NewLine}AnotherLine[NewLine}A third line[newline]End"); + lineEndLengths = new int[] { 9 }; + checkLines(new int[] { 6, 44 }); + } +} diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java index 8e2650685f6..be13f581481 100644 --- a/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java +++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java @@ -29,6 +29,8 @@ import org.eclipse.text.tests.templates.TemplatesTestSuite; */ @RunWith(Suite.class) @SuiteClasses({ + MultiStringMatcherTest.class, + ConfigurableLineTrackerTest.class, LineTrackerTest4.class, DocumentExtensionTest.class, LineTrackerTest3.class, diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java new file mode 100644 index 00000000000..b50f57725ef --- /dev/null +++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java @@ -0,0 +1,452 @@ +/******************************************************************************* + * Copyright (c) 2019 Thomas Wolf and others. + * + * This program and the accompanying materials + * are made available under the terms of the Eclipse Public License 2.0 + * which accompanies this distribution, and is available at + * https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + *******************************************************************************/ +package org.eclipse.text.tests; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +import java.util.Collections; +import java.util.List; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import org.eclipse.jface.text.MultiStringMatcher; +import org.eclipse.jface.text.MultiStringMatcher.Match; + +public class MultiStringMatcherTest { + + @Rule + public ExpectedException thrown = ExpectedException.none(); + + private static Match run(String text, String... needles) { + return MultiStringMatcher.indexOf(text, 0, needles); + } + + private static Match run(String text, int offset, String... needles) { + return MultiStringMatcher.indexOf(text, offset, needles); + } + + private static void test(Match m, String expected, int index) { + assertNotNull("No match", m); + assertEquals("Unexpected match", expected, m.getText()); + assertEquals("Unexpected index", index, m.getOffset()); + } + + private static void testList(List<Match> matches, String expected) { + Collections.sort(matches, (a, b) -> { + int cmp = Integer.compare(a.getOffset(), b.getOffset()); + if (cmp != 0) { + return cmp; + } + return Integer.compare(a.getText().length(), b.getText().length()); + }); + assertEquals("Unexpected results", expected, matches.toString()); + } + + @Test + public void test001() throws Exception { + Match m = run("dcccacabcccabcc", "ab", "cab"); + test(m, "cab", 5); + } + + @Test + public void test002() throws Exception { + Match m = run("dcccacabcccabcc", "ab", "abc"); + test(m, "abc", 6); + } + + @Test + public void test003() throws Exception { + Match m = run("dcccacabcccabcc", "ab", "cxb"); + test(m, "ab", 6); + } + + @Test + public void test004() throws Exception { + Match m = run("dcccacabcccabcc", "abc", "cabx"); + test(m, "abc", 6); + } + + @Test + public void test005() throws Exception { + Match m = run("dacabddd", "ac", "cab"); + test(m, "ac", 1); + } + + @Test + public void test006() throws Exception { + Match m = run("dacabddd", "aca", "cab"); + test(m, "aca", 1); + } + + @Test + public void test007() throws Exception { + Match m = run("dacabddd", "acab", "cab"); + test(m, "acab", 1); + } + + @Test + public void test008() throws Exception { + Match m = run("ddddddcac", "ac", "cab"); + test(m, "ac", 7); + } + + @Test + public void test009() throws Exception { + Match m = run("dddddcacddd", "cacx", "ac"); + test(m, "ac", 6); + } + + @Test + public void test010() throws Exception { + Match m = run("ddddddcac", "ac", "cac"); + test(m, "cac", 6); + } + + @Test + public void test011() throws Exception { + Match m = run("a", "a", "b", "ab"); + test(m, "a", 0); + } + + @Test + public void test012() throws Exception { + Match m = run("b", "a", "b", "ab"); + test(m, "b", 0); + } + + @Test + public void test013() throws Exception { + Match m = run("ab", "a", "b", "ab"); + test(m, "ab", 0); + } + + @Test + public void test014() throws Exception { + Match m = run("", "a", "b", "ab"); + assertNull("Expected no match", m); + } + + @Test + public void test015() throws Exception { + Match m = run("dddca", "ac", "cac", "ab"); + assertNull("Expected no match", m); + } + + @Test + public void test016() throws Exception { + Match m = run("ab", "ab", "b"); + test(m, "ab", 0); + } + + @Test + public void test017() throws Exception { + Match m = run("ushers", "he", "she", "his", "hers"); + test(m, "she", 1); + } + + @Test + public void test018() throws Exception { + Match m = run("dddhisheddd", "he", "she", "his", "hers"); + test(m, "his", 3); + } + + @Test + public void test019() throws Exception { + Match m = run("sotat", "at", "art", "oars", "soar"); + test(m, "at", 3); + } + + @Test + public void test020() throws Exception { + Match m = run("xxx", "x", "xx", "xxx"); + test(m, "xxx", 0); + } + + @Test + public void test021() throws Exception { + Match m = run("xx", "x", "xx", "xxx"); + test(m, "xx", 0); + } + + @Test + public void test022() throws Exception { + Match m = run("x", "x", "xx", "xxx"); + test(m, "x", 0); + } + + @Test + public void test023() throws Exception { + Match m = run("Lorem\r\nIpsum", "\n", "\r\n", "\r"); + test(m, "\r\n", 5); + } + + @Test + public void test024() throws Exception { + Match m = run("dcccacabcccabcc", "ab", "abcd"); + test(m, "ab", 6); + } + + @Test + public void test025() throws Exception { + Match m = run("dcccacabcccabcc", "abcd", "bccc"); + test(m, "bccc", 7); + } + + @Test + public void test026() throws Exception { + Match m = run("xxx", 1, "x", "xx", "xxx"); + test(m, "xx", 1); + } + + @Test + public void test027() throws Exception { + Match m = run("xxx", 2, "x", "xx", "xxx"); + test(m, "x", 2); + } + + @Test + public void test028() throws Exception { + Match m = run("dddhisheddd", 7, "he", "she", "his", "hers"); + assertNull("Expected no match", m); + } + + @Test + public void test029() throws Exception { + Match m = run("Lorem001Ipsum", "1", "01", "0"); + test(m, "0", 5); + } + + @Test + public void test030() throws Exception { + Match m = run("Lorem01Ipsum", "1", "01", "0"); + test(m, "01", 5); + } + + @Test + public void test031() throws Exception { + Match m = run("ddcababababababcabxdd", "ca", "cabx", "ababc"); + test(m, "ca", 2); + } + + @Test + public void test032() throws Exception { + Match m = run("ddcababcabxdd", "ca", "cabx", "cababc", "ababc"); + test(m, "cababc", 2); + } + + @Test + public void test033() throws Exception { + Match m = run("ddcababcdd", "ca", "cabx", "cababc", "ababc"); + test(m, "cababc", 2); + } + + @Test + public void test034() throws Exception { + Match m = run("ddcababcdd", "cababx", "ababc"); + test(m, "ababc", 3); + } + + @Test + public void test035() throws Exception { + Match m = run("ddcababcdd", "cababx", "abab", "ababc"); + test(m, "ababc", 3); + } + + @Test + public void test036() throws Exception { + Match m = run("ddcabababcdd", "ab", "abab", "cabababcy", "cabababcyy"); + test(m, "abab", 3); + } + + @Test + public void test037() throws Exception { + Match m = run("ddcabababcdd", "ab", "abab", "cabababc", "cabababcyy"); + test(m, "cabababc", 2); + } + + @Test + public void test038() throws Exception { + Match m = run("ddcabababcdd", "ab", "bababcd", "cabababcy", "cabababcyy"); + test(m, "ab", 3); + } + + @Test + public void test039() throws Exception { + Match m = run("ddcabababcddf", "bababcd", "bababcddf", "cabababcy", "cabababcyy"); + test(m, "bababcddf", 4); + } + + @Test + public void test040() throws Exception { + Match m = run("ddcabababcddffxx", "ffxx", "bababcd", "bababcddffy", "cabababcy", "cabababcyy"); + test(m, "bababcd", 4); + } + + @Test + public void multi001() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers"); + List<Match> matches = m.find("ushers", 0); + testList(matches, "[[she, 1], [he, 2], [hers, 2]]"); + } + + @Test + public void multi002() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers"); + List<Match> matches = m.find("dddhisheddd", 0); + testList(matches, "[[his, 3], [she, 5], [he, 6]]"); + } + + @Test + public void multi003() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "sh", "is"); + List<Match> matches = m.find("dddhisheddd", 0); + testList(matches, "[[his, 3], [is, 4], [sh, 5], [she, 5], [he, 6]]"); + } + + @Test + public void multi004() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("ab", "about", "at", "ate", "be", "bed", "edge", "get"); + List<Match> matches = m.find("abedgetab", 0); + testList(matches, "[[ab, 0], [be, 1], [bed, 1], [edge, 2], [get, 4], [ab, 7]]"); + } + + @Test + public void multi005() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar"); + List<Match> matches = m.find("soars", 0); + testList(matches, "[[soar, 0], [oars, 1]]"); + } + + @Test + public void multi006() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar"); + List<Match> matches = m.find("oart", 0); + testList(matches, "[[art, 1]]"); + } + + @Test + public void multi007() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar"); + List<Match> matches = m.find("soarsoars", 0); + testList(matches, "[[soar, 0], [oars, 1], [soar, 4], [oars, 5]]"); + } + + @Test + public void multi008() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar"); + List<Match> matches = m.find("sotat", 0); + testList(matches, "[[at, 3]]"); + } + + @Test + public void multi009() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("i", "in", "tin", "sting"); + List<Match> matches = m.find("sting", 0); + testList(matches, "[[sting, 0], [tin, 1], [i, 2], [in, 2]]"); + } + + @Test + public void multi010() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("i", "in", "tin", "sting", "hastings"); + List<Match> matches = m.find("hastings", 0); + testList(matches, "[[hastings, 0], [sting, 2], [tin, 3], [i, 4], [in, 4]]"); + } + + @Test + public void multi011() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("x", "xx", "xxx"); + List<Match> matches = m.find("xxx", 0); + testList(matches, "[[x, 0], [xx, 0], [xxx, 0], [x, 1], [xx, 1], [x, 2]]"); + } + + @Test + public void multi012() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("she", "his", "hers"); + List<Match> matches = m.find("dddhiheddd", 0); + assertEquals("Expected no match", 0, matches.size()); + } + + @Test + public void multi013() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers"); + List<Match> matches = m.find("dddhisheddd", 4); + testList(matches, "[[she, 5], [he, 6]]"); + } + + @Test + public void multi014() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("x", "xx", "xxx"); + List<Match> matches = m.find("xxx", 2); + testList(matches, "[[x, 2]]"); + } + + @Test + public void noStrings001() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().build(); + assertNull("Expected no match", m.indexOf("dhihedd", 0)); + List<Match> matches = m.find("dddhiheddd", 0); + assertEquals("Expected no match", 0, matches.size()); + } + + @Test + public void noStrings002() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add("").build(); + assertNull("Expected no match", m.indexOf("dhihedd", 0)); + List<Match> matches = m.find("dddhiheddd", 0); + assertEquals("Expected no match", 0, matches.size()); + } + + @Test + public void noStrings003() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add((String[]) null).build(); + assertNull("Expected no match", m.indexOf("dhihedd", 0)); + List<Match> matches = m.find("dddhiheddd", 0); + assertEquals("Expected no match", 0, matches.size()); + } + + @Test + public void fluent001() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add("he", "she", "his", "hers").build(); + test(m.indexOf("ushers", 0), "she", 1); + } + + @Test + public void fluent002() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add("he", "she").add("his", "hers").build(); + testList(m.find("ushers", 0), "[[she, 1], [he, 2], [hers, 2]]"); + } + + @Test + public void fluent003() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add("he").add(null, "she", "").add("his", "hers").build(); + testList(m.find("ushers", 0), "[[she, 1], [he, 2], [hers, 2]]"); + } + + @Test + public void addAfterBuild() throws Exception { + MultiStringMatcher.Builder b = MultiStringMatcher.builder().add("he", "she").add("his", "hers"); + b.build(); + thrown.expect(IllegalStateException.class); + b.add("us"); + } + + @Test + public void reuseBuilder() throws Exception { + MultiStringMatcher.Builder b = MultiStringMatcher.builder().add("he", "she").add("his", "hers"); + b.build(); + thrown.expect(IllegalStateException.class); + b.build(); + } +} diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java index 7504167e805..3a0cc75cc47 100644 --- a/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java +++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java @@ -13,7 +13,8 @@ *******************************************************************************/ package org.eclipse.text.tests; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import java.util.ArrayList; import java.util.Iterator; |