diff options
author | Thomas Wolf | 2019-08-17 10:53:30 +0000 |
---|---|---|
committer | Paul Pazderski | 2019-08-24 12:58:34 +0000 |
commit | e50ee43b2cf76c97c833fe400539c00f9ebe8fc7 (patch) | |
tree | 9e1c5b8cd936580c9e2e1a698f58e662c343937b | |
parent | c58c96bcb725d38350343f31c7a8988853f19709 (diff) | |
download | eclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.tar.gz eclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.tar.xz eclipse.platform.text-e50ee43b2cf76c97c833fe400539c00f9ebe8fc7.zip |
Bug 545252 - ConfigurableLineTracker: use the Aho-Corasick algorithmI20190826-0640I20190826-0415
The ConfigurableLineTracker had quadratic performance characteristics,
making line-end detection even only with the standard line terminators
extremely slow already on moderately sized sources.
Add an efficient MultiStringMatcher using the Aho-Corasick algorithm
to efficiently find any of a fixed set of strings in a text. The
algorithm is a well-described efficient general-purpose multi-string
matching algorithm based on a trie. Its main search loop is very
simple; the more tricky bits are in the construction of the auxiliary
links in the trie. The algorithm as described in 1975 returns all
matches including overlaps within a text, but it can be adapted fairly
easily to support the "leftmost longest match" semantics needed by
ConfigurableLineTracker.
The Aho-Corasick algorithm has a complexity of O(n + m + z),
where n is the text length, m the sum of the length of patterns,
and z the number of matches. For line end matching, this is roughly
O(n + L), where L is the number of lines.
Note that it can produce a quadratic number of matches on degenerate
input (patterns "x", "xx", "xxx", and so on, and a text consisting
only of "x"s).
Adapt ConfigurableLineTracker to use the new MultiStringMatcher. The
matcher provides a find() operation to find all (possibly overlapping)
occurrences of the pattern strings; this is the normal way the
Aho-Corasick algorithm works. An indexOf() method provides a way to
efficiently find only the leftmost longest match.
Since this new matcher is generally useful and fast, provide a builder
interface through which pattern strings can be added iteratively
before matching.
Change-Id: I3e66482cfca271248caaad79157f843bc2c819d7
Signed-off-by: Thomas Wolf <thomas.wolf@paranor.ch>
Also-by: Paul Pazderski <paul-eclipse@ppazderski.de>
8 files changed, 999 insertions, 13 deletions
diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java new file mode 100644 index 00000000000..fe6f597485f --- /dev/null +++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/ConfigurableLineTrackerTest.java @@ -0,0 +1,96 @@ +/******************************************************************************* + * Copyright (c) 2019 Thomas Wolf and others. + * + * This program and the accompanying materials + * are made available under the terms of the Eclipse Public License 2.0 + * which accompanies this distribution, and is available at + * https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + *******************************************************************************/ +package org.eclipse.text.tests; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.eclipse.jface.text.ConfigurableLineTracker; +import org.eclipse.jface.text.DefaultLineTracker; +import org.eclipse.jface.text.GapTextStore; + +public class ConfigurableLineTrackerTest extends AbstractLineTrackerTest { + + @Before + public void setUp() { + fText= new GapTextStore(); + } + + @After + public void tearDown() { + fTracker= null; + fText= null; + } + + /** + * Expected length of the tracked delimiter. First entry is length of first + * delimiter and so on. + */ + private int[] lineEndLengths; + + @Override + protected int getLineOffset(int line, int[] lines) { + int offset= 0; + for (int i= 0; i < line; i++) { + offset += (lines[i] + lineEndLengths[i]); + } + return offset; + } + + private void setLegalDelimiters(String... delimiters) { + fTracker = new ConfigurableLineTracker(delimiters); + } + + @Test + public void testLongestMatch() throws Exception { + setLegalDelimiters(DefaultLineTracker.DELIMITERS); + set("xxxx\r\nxx"); + lineEndLengths = new int[] {2}; + checkLines(new int[] {4, 2}); + } + + @Test + public void testMixedDefaultDelimiter() throws Exception { + setLegalDelimiters(DefaultLineTracker.DELIMITERS); + set("1234\r\n123\r1234\n1234567\r"); + lineEndLengths = new int[] { 2, 1, 1, 1 }; + checkLines(new int[] { 4, 3, 4, 7, 0 }); + set("first\r\r\nthird"); + lineEndLengths = new int[] { 1, 2 }; + checkLines(new int[] { 5, 0, 5 }); + } + + @Test + public void testSingleDelimiter() throws Exception { + setLegalDelimiters("\n"); + set("xxxx\nxx"); + lineEndLengths = new int[] { 1 }; + checkLines(new int[] { 4, 2 }); + } + + @Test + public void testAlternativeDelimiters1() throws Exception { + // create a line tracker with "show whitespace" characters as delimiter + setLegalDelimiters("\u00a4", "\u00b6", "\u00a4\u00b6"); + set("xxxx\u00a4\u00b6xx"); + lineEndLengths = new int[] { 2 }; + checkLines(new int[] { 4, 2 }); + } + + @Test + public void testAlternativeDelimiters2() throws Exception { + setLegalDelimiters("{NewLine}", "[NewLine]"); + set("A line{NewLine}AnotherLine[NewLine}A third line[newline]End"); + lineEndLengths = new int[] { 9 }; + checkLines(new int[] { 6, 44 }); + } +} diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java index 8e2650685f6..be13f581481 100644 --- a/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java +++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/EclipseTextTestSuite.java @@ -29,6 +29,8 @@ import org.eclipse.text.tests.templates.TemplatesTestSuite; */ @RunWith(Suite.class) @SuiteClasses({ + MultiStringMatcherTest.class, + ConfigurableLineTrackerTest.class, LineTrackerTest4.class, DocumentExtensionTest.class, LineTrackerTest3.class, diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java new file mode 100644 index 00000000000..b50f57725ef --- /dev/null +++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/MultiStringMatcherTest.java @@ -0,0 +1,452 @@ +/******************************************************************************* + * Copyright (c) 2019 Thomas Wolf and others. + * + * This program and the accompanying materials + * are made available under the terms of the Eclipse Public License 2.0 + * which accompanies this distribution, and is available at + * https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + *******************************************************************************/ +package org.eclipse.text.tests; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +import java.util.Collections; +import java.util.List; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import org.eclipse.jface.text.MultiStringMatcher; +import org.eclipse.jface.text.MultiStringMatcher.Match; + +public class MultiStringMatcherTest { + + @Rule + public ExpectedException thrown = ExpectedException.none(); + + private static Match run(String text, String... needles) { + return MultiStringMatcher.indexOf(text, 0, needles); + } + + private static Match run(String text, int offset, String... needles) { + return MultiStringMatcher.indexOf(text, offset, needles); + } + + private static void test(Match m, String expected, int index) { + assertNotNull("No match", m); + assertEquals("Unexpected match", expected, m.getText()); + assertEquals("Unexpected index", index, m.getOffset()); + } + + private static void testList(List<Match> matches, String expected) { + Collections.sort(matches, (a, b) -> { + int cmp = Integer.compare(a.getOffset(), b.getOffset()); + if (cmp != 0) { + return cmp; + } + return Integer.compare(a.getText().length(), b.getText().length()); + }); + assertEquals("Unexpected results", expected, matches.toString()); + } + + @Test + public void test001() throws Exception { + Match m = run("dcccacabcccabcc", "ab", "cab"); + test(m, "cab", 5); + } + + @Test + public void test002() throws Exception { + Match m = run("dcccacabcccabcc", "ab", "abc"); + test(m, "abc", 6); + } + + @Test + public void test003() throws Exception { + Match m = run("dcccacabcccabcc", "ab", "cxb"); + test(m, "ab", 6); + } + + @Test + public void test004() throws Exception { + Match m = run("dcccacabcccabcc", "abc", "cabx"); + test(m, "abc", 6); + } + + @Test + public void test005() throws Exception { + Match m = run("dacabddd", "ac", "cab"); + test(m, "ac", 1); + } + + @Test + public void test006() throws Exception { + Match m = run("dacabddd", "aca", "cab"); + test(m, "aca", 1); + } + + @Test + public void test007() throws Exception { + Match m = run("dacabddd", "acab", "cab"); + test(m, "acab", 1); + } + + @Test + public void test008() throws Exception { + Match m = run("ddddddcac", "ac", "cab"); + test(m, "ac", 7); + } + + @Test + public void test009() throws Exception { + Match m = run("dddddcacddd", "cacx", "ac"); + test(m, "ac", 6); + } + + @Test + public void test010() throws Exception { + Match m = run("ddddddcac", "ac", "cac"); + test(m, "cac", 6); + } + + @Test + public void test011() throws Exception { + Match m = run("a", "a", "b", "ab"); + test(m, "a", 0); + } + + @Test + public void test012() throws Exception { + Match m = run("b", "a", "b", "ab"); + test(m, "b", 0); + } + + @Test + public void test013() throws Exception { + Match m = run("ab", "a", "b", "ab"); + test(m, "ab", 0); + } + + @Test + public void test014() throws Exception { + Match m = run("", "a", "b", "ab"); + assertNull("Expected no match", m); + } + + @Test + public void test015() throws Exception { + Match m = run("dddca", "ac", "cac", "ab"); + assertNull("Expected no match", m); + } + + @Test + public void test016() throws Exception { + Match m = run("ab", "ab", "b"); + test(m, "ab", 0); + } + + @Test + public void test017() throws Exception { + Match m = run("ushers", "he", "she", "his", "hers"); + test(m, "she", 1); + } + + @Test + public void test018() throws Exception { + Match m = run("dddhisheddd", "he", "she", "his", "hers"); + test(m, "his", 3); + } + + @Test + public void test019() throws Exception { + Match m = run("sotat", "at", "art", "oars", "soar"); + test(m, "at", 3); + } + + @Test + public void test020() throws Exception { + Match m = run("xxx", "x", "xx", "xxx"); + test(m, "xxx", 0); + } + + @Test + public void test021() throws Exception { + Match m = run("xx", "x", "xx", "xxx"); + test(m, "xx", 0); + } + + @Test + public void test022() throws Exception { + Match m = run("x", "x", "xx", "xxx"); + test(m, "x", 0); + } + + @Test + public void test023() throws Exception { + Match m = run("Lorem\r\nIpsum", "\n", "\r\n", "\r"); + test(m, "\r\n", 5); + } + + @Test + public void test024() throws Exception { + Match m = run("dcccacabcccabcc", "ab", "abcd"); + test(m, "ab", 6); + } + + @Test + public void test025() throws Exception { + Match m = run("dcccacabcccabcc", "abcd", "bccc"); + test(m, "bccc", 7); + } + + @Test + public void test026() throws Exception { + Match m = run("xxx", 1, "x", "xx", "xxx"); + test(m, "xx", 1); + } + + @Test + public void test027() throws Exception { + Match m = run("xxx", 2, "x", "xx", "xxx"); + test(m, "x", 2); + } + + @Test + public void test028() throws Exception { + Match m = run("dddhisheddd", 7, "he", "she", "his", "hers"); + assertNull("Expected no match", m); + } + + @Test + public void test029() throws Exception { + Match m = run("Lorem001Ipsum", "1", "01", "0"); + test(m, "0", 5); + } + + @Test + public void test030() throws Exception { + Match m = run("Lorem01Ipsum", "1", "01", "0"); + test(m, "01", 5); + } + + @Test + public void test031() throws Exception { + Match m = run("ddcababababababcabxdd", "ca", "cabx", "ababc"); + test(m, "ca", 2); + } + + @Test + public void test032() throws Exception { + Match m = run("ddcababcabxdd", "ca", "cabx", "cababc", "ababc"); + test(m, "cababc", 2); + } + + @Test + public void test033() throws Exception { + Match m = run("ddcababcdd", "ca", "cabx", "cababc", "ababc"); + test(m, "cababc", 2); + } + + @Test + public void test034() throws Exception { + Match m = run("ddcababcdd", "cababx", "ababc"); + test(m, "ababc", 3); + } + + @Test + public void test035() throws Exception { + Match m = run("ddcababcdd", "cababx", "abab", "ababc"); + test(m, "ababc", 3); + } + + @Test + public void test036() throws Exception { + Match m = run("ddcabababcdd", "ab", "abab", "cabababcy", "cabababcyy"); + test(m, "abab", 3); + } + + @Test + public void test037() throws Exception { + Match m = run("ddcabababcdd", "ab", "abab", "cabababc", "cabababcyy"); + test(m, "cabababc", 2); + } + + @Test + public void test038() throws Exception { + Match m = run("ddcabababcdd", "ab", "bababcd", "cabababcy", "cabababcyy"); + test(m, "ab", 3); + } + + @Test + public void test039() throws Exception { + Match m = run("ddcabababcddf", "bababcd", "bababcddf", "cabababcy", "cabababcyy"); + test(m, "bababcddf", 4); + } + + @Test + public void test040() throws Exception { + Match m = run("ddcabababcddffxx", "ffxx", "bababcd", "bababcddffy", "cabababcy", "cabababcyy"); + test(m, "bababcd", 4); + } + + @Test + public void multi001() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers"); + List<Match> matches = m.find("ushers", 0); + testList(matches, "[[she, 1], [he, 2], [hers, 2]]"); + } + + @Test + public void multi002() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers"); + List<Match> matches = m.find("dddhisheddd", 0); + testList(matches, "[[his, 3], [she, 5], [he, 6]]"); + } + + @Test + public void multi003() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "sh", "is"); + List<Match> matches = m.find("dddhisheddd", 0); + testList(matches, "[[his, 3], [is, 4], [sh, 5], [she, 5], [he, 6]]"); + } + + @Test + public void multi004() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("ab", "about", "at", "ate", "be", "bed", "edge", "get"); + List<Match> matches = m.find("abedgetab", 0); + testList(matches, "[[ab, 0], [be, 1], [bed, 1], [edge, 2], [get, 4], [ab, 7]]"); + } + + @Test + public void multi005() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar"); + List<Match> matches = m.find("soars", 0); + testList(matches, "[[soar, 0], [oars, 1]]"); + } + + @Test + public void multi006() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar"); + List<Match> matches = m.find("oart", 0); + testList(matches, "[[art, 1]]"); + } + + @Test + public void multi007() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar"); + List<Match> matches = m.find("soarsoars", 0); + testList(matches, "[[soar, 0], [oars, 1], [soar, 4], [oars, 5]]"); + } + + @Test + public void multi008() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("at", "art", "oars", "soar"); + List<Match> matches = m.find("sotat", 0); + testList(matches, "[[at, 3]]"); + } + + @Test + public void multi009() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("i", "in", "tin", "sting"); + List<Match> matches = m.find("sting", 0); + testList(matches, "[[sting, 0], [tin, 1], [i, 2], [in, 2]]"); + } + + @Test + public void multi010() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("i", "in", "tin", "sting", "hastings"); + List<Match> matches = m.find("hastings", 0); + testList(matches, "[[hastings, 0], [sting, 2], [tin, 3], [i, 4], [in, 4]]"); + } + + @Test + public void multi011() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("x", "xx", "xxx"); + List<Match> matches = m.find("xxx", 0); + testList(matches, "[[x, 0], [xx, 0], [xxx, 0], [x, 1], [xx, 1], [x, 2]]"); + } + + @Test + public void multi012() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("she", "his", "hers"); + List<Match> matches = m.find("dddhiheddd", 0); + assertEquals("Expected no match", 0, matches.size()); + } + + @Test + public void multi013() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("he", "she", "his", "hers"); + List<Match> matches = m.find("dddhisheddd", 4); + testList(matches, "[[she, 5], [he, 6]]"); + } + + @Test + public void multi014() throws Exception { + MultiStringMatcher m = MultiStringMatcher.create("x", "xx", "xxx"); + List<Match> matches = m.find("xxx", 2); + testList(matches, "[[x, 2]]"); + } + + @Test + public void noStrings001() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().build(); + assertNull("Expected no match", m.indexOf("dhihedd", 0)); + List<Match> matches = m.find("dddhiheddd", 0); + assertEquals("Expected no match", 0, matches.size()); + } + + @Test + public void noStrings002() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add("").build(); + assertNull("Expected no match", m.indexOf("dhihedd", 0)); + List<Match> matches = m.find("dddhiheddd", 0); + assertEquals("Expected no match", 0, matches.size()); + } + + @Test + public void noStrings003() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add((String[]) null).build(); + assertNull("Expected no match", m.indexOf("dhihedd", 0)); + List<Match> matches = m.find("dddhiheddd", 0); + assertEquals("Expected no match", 0, matches.size()); + } + + @Test + public void fluent001() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add("he", "she", "his", "hers").build(); + test(m.indexOf("ushers", 0), "she", 1); + } + + @Test + public void fluent002() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add("he", "she").add("his", "hers").build(); + testList(m.find("ushers", 0), "[[she, 1], [he, 2], [hers, 2]]"); + } + + @Test + public void fluent003() throws Exception { + MultiStringMatcher m = MultiStringMatcher.builder().add("he").add(null, "she", "").add("his", "hers").build(); + testList(m.find("ushers", 0), "[[she, 1], [he, 2], [hers, 2]]"); + } + + @Test + public void addAfterBuild() throws Exception { + MultiStringMatcher.Builder b = MultiStringMatcher.builder().add("he", "she").add("his", "hers"); + b.build(); + thrown.expect(IllegalStateException.class); + b.add("us"); + } + + @Test + public void reuseBuilder() throws Exception { + MultiStringMatcher.Builder b = MultiStringMatcher.builder().add("he", "she").add("his", "hers"); + b.build(); + thrown.expect(IllegalStateException.class); + b.build(); + } +} diff --git a/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java b/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java index 7504167e805..3a0cc75cc47 100644 --- a/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java +++ b/org.eclipse.text.tests/src/org/eclipse/text/tests/TextUtilitiesTest.java @@ -13,7 +13,8 @@ *******************************************************************************/ package org.eclipse.text.tests; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import java.util.ArrayList; import java.util.Iterator; diff --git a/org.eclipse.text/META-INF/MANIFEST.MF b/org.eclipse.text/META-INF/MANIFEST.MF index 82e539f8d2b..4ee82ebb297 100644 --- a/org.eclipse.text/META-INF/MANIFEST.MF +++ b/org.eclipse.text/META-INF/MANIFEST.MF @@ -2,7 +2,7 @@ Manifest-Version: 1.0 Bundle-ManifestVersion: 2 Bundle-Name: %pluginName Bundle-SymbolicName: org.eclipse.text -Bundle-Version: 3.8.300.qualifier +Bundle-Version: 3.9.0.qualifier Bundle-Vendor: %providerName Bundle-Localization: plugin Export-Package: diff --git a/org.eclipse.text/pom.xml b/org.eclipse.text/pom.xml index 1855055d5c2..41e06eee8f7 100644 --- a/org.eclipse.text/pom.xml +++ b/org.eclipse.text/pom.xml @@ -18,6 +18,6 @@ </parent> <groupId>org.eclipse.text</groupId> <artifactId>org.eclipse.text</artifactId> - <version>3.8.300-SNAPSHOT</version> + <version>3.9.0-SNAPSHOT</version> <packaging>eclipse-plugin</packaging> </project> diff --git a/org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java b/org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java index 75b8a932d8e..a9373d014f7 100644 --- a/org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java +++ b/org.eclipse.text/src/org/eclipse/jface/text/ConfigurableLineTracker.java @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2000, 2008 IBM Corporation and others. + * Copyright (c) 2000, 2019 IBM Corporation and others. * * This program and the accompanying materials * are made available under the terms of the Eclipse Public License 2.0 @@ -10,11 +10,14 @@ * * Contributors: * IBM Corporation - initial API and implementation + * Thomas Wolf - Bug 545252: improved search performance for multiple delimiters *******************************************************************************/ package org.eclipse.jface.text; import org.eclipse.core.runtime.Assert; +import org.eclipse.jface.text.MultiStringMatcher.Match; + /** * Standard implementation of a generic @@ -31,12 +34,12 @@ import org.eclipse.core.runtime.Assert; */ public class ConfigurableLineTracker extends AbstractLineTracker { - /** The strings which are considered being the line delimiter */ - private String[] fDelimiters; + private final String[] fDelimiters; /** A predefined delimiter information which is always reused as return value */ - private DelimiterInfo fDelimiterInfo= new DelimiterInfo(); - + private final DelimiterInfo fDelimiterInfo= new DelimiterInfo(); + /** Util to search the configured line delimiters in text. <code>null</code> if only one delimiter is used. */ + private final MultiStringMatcher fMatcher; /** * Creates a standard line tracker for the given line delimiters. @@ -47,6 +50,7 @@ public class ConfigurableLineTracker extends AbstractLineTracker { public ConfigurableLineTracker(String[] legalLineDelimiters) { Assert.isTrue(legalLineDelimiters != null && legalLineDelimiters.length > 0); fDelimiters= TextUtilities.copy(legalLineDelimiters); + fMatcher= legalLineDelimiters.length > 1 ? MultiStringMatcher.create(legalLineDelimiters) : null; } @Override @@ -56,12 +60,13 @@ public class ConfigurableLineTracker extends AbstractLineTracker { @Override protected DelimiterInfo nextDelimiterInfo(String text, int offset) { - if (fDelimiters.length > 1) { - int[] info= TextUtilities.indexOf(fDelimiters, text, offset); - if (info[0] == -1) + if (fMatcher != null) { + Match m = fMatcher.indexOf(text, offset); + if (m == null) { return null; - fDelimiterInfo.delimiterIndex= info[0]; - fDelimiterInfo.delimiter= fDelimiters[info[1]]; + } + fDelimiterInfo.delimiterIndex= m.getOffset(); + fDelimiterInfo.delimiter= m.getText(); } else { int index= text.indexOf(fDelimiters[0], offset); if (index == -1) diff --git a/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java b/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java new file mode 100644 index 00000000000..daef7f70ab3 --- /dev/null +++ b/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java @@ -0,0 +1,430 @@ +/******************************************************************************* + * Copyright (c) 2019 Paul Pazderski, Thomas Wolf, and others. + * + * This program and the accompanying materials + * are made available under the terms of the Eclipse Public License 2.0 + * which accompanies this distribution, and is available at + * https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * + * Contributors: + * Paul Pazderski; Thomas Wolf - initial API and implementation + *******************************************************************************/ +package org.eclipse.jface.text; + +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Fast matcher to find the occurrences of any of a fixed set of constant strings. Supports finding + * all (possibly overlapping) matches, or only the leftmost longest match. + * + * @since 3.9 + */ +public class MultiStringMatcher { + + // An implementation of the Aho-Corasick algorithm (without the DFA construction from section 6 of the + // paper; just the failure and output links). + // + // See Aho, Alfred A.; Corasick, Margaret J.: "Efficient String Matching: An Aid to Bibliographic Search", + // CACM 18(6), 1975. + // + // The algorithm has been modified to support both reporting all matches or only leftmost longest matches. + + /** + * Describes a match result of {@link MultiStringMatcher#indexOf(CharSequence, int)}, giving + * access to the matched string and the offset in the text it was matched at. + */ + public static interface Match { + + /** + * Obtains the matched string. + * + * @return the text matched + */ + String getText(); + + /** + * Obtains the offset the {@link #getText() text} was matched at. + * + * @return the offset + */ + int getOffset(); + + } + + /** A Builder for creating a {@link MultiStringMatcher}. */ + public static interface Builder { + + /** + * Adds search strings to be looked for. {@code null} and empty strings in the arguments are + * ignored. + * + * @param searchStrings to add to be looked for by the matcher. + * @return this + * @throws IllegalStateException if the {@link MultiStringMatcher} was already built. + */ + Builder add(String... searchStrings); + + /** + * Returns the {@link MultiStringMatcher} built by this builder. + * <p> + * Note that a {@link Builder} instance can build only <em>one</em> + * {@link MultiStringMatcher} instance. This is by design; otherwise the builder would have + * to store all the searchStrings somewhere, which may be rather memory intensive if a lot + * of search strings are added. + * </p> + * + * @return the {@link MultiStringMatcher} + * @throws IllegalStateException if the {@link MultiStringMatcher} was already built. + */ + MultiStringMatcher build(); + } + + private static class BuilderImpl implements Builder { + + private MultiStringMatcher m; + + BuilderImpl() { + m= new MultiStringMatcher(); + } + + private void check() { + if (m == null) { + throw new IllegalStateException("Builder.build() was already called"); //$NON-NLS-1$ + } + } + + @Override + public Builder add(String... searchStrings) { + check(); + m.add(searchStrings); + return this; + } + + @Override + public MultiStringMatcher build() { + check(); + MultiStringMatcher result= m; + m= null; + if (!result.root.hasChildren()) { + // no search strings were added; return a specialized "matches nothing" matcher + return new MultiStringMatcher() { + @Override + public List<Match> find(CharSequence text, int offset) { + return new LinkedList<>(); + } + + @Override + public Match indexOf(CharSequence text, int offset) { + return null; + } + }; + } + result.buildLinks(); + return result; + } + } + + /** + * Creates an initially empty {@link Builder}. + * + * @return the {@link Builder} + */ + public static Builder builder() { + return new BuilderImpl(); + } + + private static class MatchResult implements Match { + + private final String match; + + private final int offset; + + public MatchResult(String match, int offset) { + this.match= match; + this.offset= offset; + } + + @Override + public String getText() { + return match; + } + + @Override + public int getOffset() { + return offset; + } + + @Override + public int hashCode() { + return Objects.hashCode(match) * 31 + Integer.hashCode(offset); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + MatchResult other= (MatchResult) obj; + return offset == other.offset && Objects.equals(match, other.match); + } + + @Override + public String toString() { + return '[' + match + ", " + offset + ']'; //$NON-NLS-1$ + } + } + + /** A node in the trie built from the search strings. */ + private static class Node { + HashMap<Character, Node> children; + + String match; + + Node fail; + + Node output; + + Node next(Character c) { + return children == null ? null : children.get(c); + } + + Node add(char c) { + if (children == null) { + children= new HashMap<>(); + } + return children.computeIfAbsent(Character.valueOf(c), key -> new Node()); + } + + boolean hasChildren() { + return children != null; + } + + @Override + public String toString() { + return "Match: " + (match == null ? "null" : '>' + match + '<') //$NON-NLS-1$ //$NON-NLS-2$ + + " Children: " + (children == null ? "<none>" : children.keySet().stream().map(c -> c.toString()).collect(Collectors.joining(", "))); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ + } + } + + /** Root node of the trie. */ + private final Node root= new Node() { + @Override + Node next(Character c) { + // Implements the sentinel loop on the root node for all non-matching characters. + Node child= super.next(c); + return child == null ? this : child; + } + }; + + private MultiStringMatcher() { + // Always use a Builder or the static helper methods to create a MultiStringMatcher + } + + private void add(String... searchStrings) { + if (searchStrings != null) { + for (String searchString : searchStrings) { + if (searchString == null || searchString.isEmpty()) { + continue; + } + Node node= root; + for (char c : searchString.toCharArray()) { + node= node.add(c); + } + node.match= searchString; + } + } + } + + private void buildLinks() { + // Build the fail and output links. See the paper referenced at the top; this + // is a one-to-one implementation of the original algorithm. Variable names + // s, r, and state are kept as in the paper. + List<Node> queue= new LinkedList<>(); + for (Node s : root.children.values()) { + if (s.children != null) { + // No need to queue nodes without children since we don't do anything + // with them anyway. + queue.add(s); + } + s.fail= root; + } + while (!queue.isEmpty()) { + Node r= queue.remove(0); + for (Map.Entry<Character, Node> entry : r.children.entrySet()) { + Character c= entry.getKey(); + Node s= entry.getValue(); + if (s.children != null) { + queue.add(s); + } + Node state= r.fail; + Node f; + while ((f= state.next(c)) == null) { + state= state.fail; + } + s.fail= f; + if (f.match != null) { + s.output= f; + } else if (f.output != null) { + s.output= f.output; + } + } + } + } + + /** + * Find the next occurrence of any of the search strings of the {@link MultiStringMatcher} in + * the given {@code text} starting at the given {@code offset}. + * <p> + * Performs a simultaneous search for all the strings, returning the leftmost match. If multiple + * search strings match at the same index, the longest match is returned. + * </p> + * + * @param text to search (not {@code null}) + * @param offset to start searching at + * @return the leftmost longest match found, or {@code null} if no match was found. + */ + public Match indexOf(CharSequence text, int offset) { + // Main search loop of the Aho-Corasick algorithm, modified to stop after + // the leftmost longest match. + // How it works: primary goal -> lowest offset, secondary goal -> longest match. + // We differ between primary and sub matches. When the first match a character we walk down + // a path on the trie to find a match. Any match we found on this way is a primary match and + // any new primary match is better than the one before. Also a primary match always got a better + // offset than any sub or failover match. + // A sub match is a sub pattern of our main trie path. There offset is always after any of the + // primaries. Sub matches are not found in offset order so we must check if a new sub match is + // better than our best so far. + // If we fell off the trie path but found a primary match we are done. No other match can get a lower + // offset. Sometimes we can continue on another trie path through the fail link. If we found no + // failover path we can return our sub match or must restart at root. If we changed to a + // failover path we must check on the first direct match if a sub match from the previous path + // is better. If yes it will be returned if not we continue like we did on the first path. + int textEnd= text.length(); + Match primaryMatch= null; + Match subMatch= null; + boolean failover= false; + Node node= root; + for (int i= offset; i < textEnd; i++) { + Character c= Character.valueOf(text.charAt(i)); + Node next= node.next(c); + if (next == null) { + // Fell off the trie. + if (primaryMatch != null) { + // Return primary match because any other match must have a higher offset. + return primaryMatch; + } + // Search for other trie to change to. + do { + node= node.fail; + } while ((next= node.next(c)) == null); + failover= (node != root); + if (!failover && subMatch != null) { + // We fell of the trie and could not switch to another. Return best sub match if possible. + return subMatch; + } + } + node= next; + if (node.match != null) { + int newOffset= i - node.match.length() + 1; + // On a failover trie the sub match can be better. + // And if it is return it because nothing better will follow. + if (failover && subMatch != null && subMatch.getOffset() < newOffset) { + return subMatch; + } + // Any new primary match is better because all have the same offset but any new must be longer. + primaryMatch= new MatchResult(node.match, newOffset); + if (!node.hasChildren()) { + // We will fall off the trie on next character so we can return right here + return primaryMatch; + } + } + // Check for sub matches but only if there is no primary match because only another primary match can be better. + if (primaryMatch == null) { + Node out= node.output; + if (out != null) { + int newOffset= i - out.match.length() + 1; + if (subMatch == null + || newOffset < subMatch.getOffset() + || (newOffset == subMatch.getOffset() && out.match.length() > subMatch.getText().length())) { + subMatch= new MatchResult(out.match, newOffset); + } + } + } + } + return primaryMatch != null ? primaryMatch : subMatch; + } + + /** + * Finds all occurrences of any of the search strings of the {@link MultiStringMatcher} in the + * given {@code text} starting at the given {@code offset}, including overlapping occurrences. + * + * @param text to search (not {@code null}) + * @param offset to start searching at + * @return a possibly empty list of matches + */ + public List<Match> find(CharSequence text, int offset) { + // Main search loop of the standard Aho-Corasick algorithm. + int textEnd= text.length(); + List<Match> matches= new LinkedList<>(); + Node node= root; + for (int i= offset; i < textEnd; i++) { + Character c= Character.valueOf(text.charAt(i)); + Node next; + while ((next= node.next(c)) == null) { + node= node.fail; + } + node= next; + if (node.match != null) { + matches.add(new MatchResult(node.match, i - node.match.length() + 1)); + } + Node out= node.output; + while (out != null) { + matches.add(new MatchResult(out.match, i - out.match.length() + 1)); + out= out.output; + } + } + return matches; + } + + /** + * Finds the leftmost longest occurrence of any of the given {@code searchStrings} in the + * {@code text} starting at the given {@code offset}. + * <p> + * To match the same set of search strings repeatedly against texts it is more efficient to + * build and re-use a {@link MultiStringMatcher}. + * </p> + * + * @param text to search (not {@code null}) + * @param offset to start searching at + * @param searchStrings to look for; non-{@code null} and non-empty strings are ignored + * @return a {@link Match} describing the match found, or {@code null} if no match was found or + * there are no non-{@code null} non-empty {@code searchStrings} + */ + public static Match indexOf(CharSequence text, int offset, String... searchStrings) { + return create(searchStrings).indexOf(text, offset); + } + + /** + * Creates a {@link MultiStringMatcher} for the given {@code searchStrings}. + * <p> + * If there are no non-{@code null} non-empty {@code searchStrings}, the returned + * {@link MultiStringMatcher} will never match anything. + * </p> + * + * @param searchStrings to look for; non-{@code null} and non-empty strings are ignored + * @return the {@link MultiStringMatcher} + */ + public static MultiStringMatcher create(String... searchStrings) { + return builder().add(searchStrings).build(); + } +} |