diff options
author | Thomas Wolf | 2019-08-21 21:29:06 +0000 |
---|---|---|
committer | Thomas Wolf | 2019-08-26 10:19:15 +0000 |
commit | 63c20ef98f0f0fb1f59a255f7344a16f24eb1e1d (patch) | |
tree | 112a92a8b670e9f3146a02806f00688d2489dab3 | |
parent | e50ee43b2cf76c97c833fe400539c00f9ebe8fc7 (diff) | |
download | eclipse.platform.text-63c20ef98f0f0fb1f59a255f7344a16f24eb1e1d.tar.gz eclipse.platform.text-63c20ef98f0f0fb1f59a255f7344a16f24eb1e1d.tar.xz eclipse.platform.text-63c20ef98f0f0fb1f59a255f7344a16f24eb1e1d.zip |
Bug 550438 - MultiStringMatcher: improve commentsY20190902-0900Y20190829-0900Y20190826-1000S4_13_0_RC1I20190902-1800I20190902-0805I20190902-0600I20190901-1800I20190901-0600I20190831-1800I20190831-0600I20190830-1800I20190830-0550I20190830-0440I20190828-1800I20190828-0600I20190827-1800I20190827-0600I20190826-1800
Changes only comments.
Change-Id: I684d96db07ce5d9b6e950c087fd067c56a375f72
Signed-off-by: Thomas Wolf <thomas.wolf@paranor.ch>
-rw-r--r-- | org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java | 50 |
1 files changed, 31 insertions, 19 deletions
diff --git a/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java b/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java index daef7f70ab3..68128e1d2f9 100644 --- a/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java +++ b/org.eclipse.text/src/org/eclipse/jface/text/MultiStringMatcher.java @@ -31,10 +31,10 @@ public class MultiStringMatcher { // An implementation of the Aho-Corasick algorithm (without the DFA construction from section 6 of the // paper; just the failure and output links). // - // See Aho, Alfred A.; Corasick, Margaret J.: "Efficient String Matching: An Aid to Bibliographic Search", + // See Aho, Alfred V.; Corasick, Margaret J.: "Efficient String Matching: An Aid to Bibliographic Search", // CACM 18(6), 1975. // - // The algorithm has been modified to support both reporting all matches or only leftmost longest matches. + // The algorithm has been modified to support reporting either all matches or only leftmost longest matches. /** * Describes a match result of {@link MultiStringMatcher#indexOf(CharSequence, int)}, giving @@ -296,19 +296,28 @@ public class MultiStringMatcher { public Match indexOf(CharSequence text, int offset) { // Main search loop of the Aho-Corasick algorithm, modified to stop after // the leftmost longest match. - // How it works: primary goal -> lowest offset, secondary goal -> longest match. - // We differ between primary and sub matches. When the first match a character we walk down - // a path on the trie to find a match. Any match we found on this way is a primary match and - // any new primary match is better than the one before. Also a primary match always got a better - // offset than any sub or failover match. - // A sub match is a sub pattern of our main trie path. There offset is always after any of the - // primaries. Sub matches are not found in offset order so we must check if a new sub match is - // better than our best so far. - // If we fell off the trie path but found a primary match we are done. No other match can get a lower - // offset. Sometimes we can continue on another trie path through the fail link. If we found no - // failover path we can return our sub match or must restart at root. If we changed to a - // failover path we must check on the first direct match if a sub match from the previous path - // is better. If yes it will be returned if not we continue like we did on the first path. + // + // To find a match, we pursue a primary goal (lowest offset) and a secondary goal + // (longest match). We differentiate between primary and sub-matches. Matching starts + // by walking down one path of the trie. Any match we find on this path is a primary + // match and any new primary match is better than the one before. A sub-match is a + // matching prefix of a suffix of the text currently scanned along the path. These + // sub-matches occur on paths off the one we're currently following, and they are + // linked in the trie via the output links. Their offset is always greater than that + // of a primary match, and sub-matches further into the 'output' chain are shorter. + // Therefore we are interested only in the first such sub-match. While walking down + // a path, sub-matches off this path are not found in offset order, so we have to + // check whether a new sub-match is better (lower offset, or longer) than a previously + // found sub-match. + // + // When we can't continue matching on the current path, the algorithm uses the fail + // links to try to find an alternate path to match (which would match a suffix of + // what was traversed so far). Therefore, if we already had a primary match, it is + // returned, since any other match must have a higher offset. If there is no alternate + // path, we fall off the trie (the algorithm bring us back to root, and would start + // again from the top). If we have any match, we may stop and return it. If we _do_ + // change to an alternate path but there's a sub-match with a lower offset, we also + // may return that. Otherwise we continue normally on the new path. int textEnd= text.length(); Match primaryMatch= null; Match subMatch= null; @@ -329,7 +338,8 @@ public class MultiStringMatcher { } while ((next= node.next(c)) == null); failover= (node != root); if (!failover && subMatch != null) { - // We fell of the trie and could not switch to another. Return best sub match if possible. + // We fell of the trie and could not switch to another. Return best sub-match + // if possible. return subMatch; } } @@ -341,14 +351,16 @@ public class MultiStringMatcher { if (failover && subMatch != null && subMatch.getOffset() < newOffset) { return subMatch; } - // Any new primary match is better because all have the same offset but any new must be longer. + // Any new primary match is better because all have the same offset but any new one + // must be longer. primaryMatch= new MatchResult(node.match, newOffset); if (!node.hasChildren()) { - // We will fall off the trie on next character so we can return right here + // We will fall off the trie on the next character, so we can return right here return primaryMatch; } } - // Check for sub matches but only if there is no primary match because only another primary match can be better. + // Check for sub matches but only if there is no primary match because only another + // primary match can be better. if (primaryMatch == null) { Node out= node.output; if (out != null) { |