diff --git a/README.md b/README.md index 585aeb5..519752c 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Add the following lines in the dependencies section of your `pom.xml` file. com.fulmicoton multiregexp - 0.3 + 0.5.1 ``` diff --git a/src/main/java/com/fulmicoton/multiregexp/MultiPattern.java b/src/main/java/com/fulmicoton/multiregexp/MultiPattern.java index a65a0b3..0012c71 100644 --- a/src/main/java/com/fulmicoton/multiregexp/MultiPattern.java +++ b/src/main/java/com/fulmicoton/multiregexp/MultiPattern.java @@ -23,15 +23,22 @@ public static MultiPattern of(String... patterns) { return new MultiPattern(Arrays.asList(patterns)); } - public MultiPatternAutomaton makeAutomatonWithPrefix(String prefix) { - final List automata = new ArrayList<>(); - for (final String ptn: this.patterns) { - final String prefixedPattern = prefix + ptn; + public MultiPatternAutomaton makeAutomatonWithPrefix(String prefix, String... exceptions) { + final List automata = new ArrayList<>(this.patterns.size()); + for (final String ptn : this.patterns) { + boolean addPrefix = true; + for (String exception : exceptions) { + if (ptn.startsWith(exception)) { + addPrefix = false; + break; + } + } + final String prefixedPattern = (addPrefix ? prefix: "") + ptn; final Automaton automaton = new RegExp(prefixedPattern).toAutomaton(); automaton.minimize(); automata.add(automaton); } - return MultiPatternAutomaton.make(automata); + return MultiPatternAutomaton.multithreadedMake(automata); } /** @@ -44,15 +51,20 @@ public MultiPatternAutomaton makeAutomatonWithPrefix(String prefix) { * @return A searcher object */ public MultiPatternSearcher searcher() { - final MultiPatternAutomaton searcherAutomaton = makeAutomatonWithPrefix(".*"); - final List indidivualAutomatons = new ArrayList<>(); - for (final String pattern: this.patterns) { + return searcher(true); + } + + + public MultiPatternSearcher searcher(final boolean tableize) { + final MultiPatternAutomaton searcherAutomaton = makeAutomatonWithPrefix(".*", ".*", "^"); + final List individualAutomatons = new ArrayList<>(this.patterns.size()); + for (final String pattern : this.patterns) { final Automaton automaton = new RegExp(pattern).toAutomaton(); automaton.minimize(); automaton.determinize(); - indidivualAutomatons.add(automaton); + individualAutomatons.add(automaton); } - return new MultiPatternSearcher(searcherAutomaton, indidivualAutomatons); + return new MultiPatternSearcher(searcherAutomaton, individualAutomatons, tableize); } diff --git a/src/main/java/com/fulmicoton/multiregexp/MultiPatternAutomaton.java b/src/main/java/com/fulmicoton/multiregexp/MultiPatternAutomaton.java index 43565b0..eaaadd5 100644 --- a/src/main/java/com/fulmicoton/multiregexp/MultiPatternAutomaton.java +++ b/src/main/java/com/fulmicoton/multiregexp/MultiPatternAutomaton.java @@ -1,17 +1,25 @@ package com.fulmicoton.multiregexp; +import java.io.Serializable; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Queue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.CountDownLatch; import dk.brics.automaton.Automaton; import dk.brics.automaton.DkBricsAutomatonHelper; import dk.brics.automaton.State; -public class MultiPatternAutomaton { +public class MultiPatternAutomaton + implements Serializable { + + private static final long serialVersionUID = -8269666436361824366L; public final int[][] accept; final boolean[] atLeastOneAccept; @@ -56,6 +64,112 @@ static MultiState initialState(List automata) { return new MultiState(initialStates); } + static MultiPatternAutomaton multithreadedMake(final List automata) { + for (final Automaton automaton: automata) { + automaton.determinize(); + } + + final char[] points = DkBricsAutomatonHelper.pointsUnion(automata); + + // states that are still to be visited + final Queue statesToVisits = new ConcurrentLinkedQueue<>(); + final MultiState initialState = initialState(automata); + statesToVisits.add(initialState); + + final Map transitionMap = new ConcurrentHashMap<>(); + + final Map multiStateIndex = new ConcurrentHashMap<>(); + multiStateIndex.put(initialState, 0); + + final int numberOfThreads = Runtime.getRuntime().availableProcessors(); + + final Object lockObject = new Object(); + final List activeThreads = Collections.synchronizedList(new ArrayList()); + final CountDownLatch doneSignal = new CountDownLatch(numberOfThreads); + for (int thread = 0; thread < numberOfThreads; thread++) { + new Thread(new Runnable() { + @Override + public void run() { + activeThreads.add(Thread.currentThread()); + + while (true) { + MultiState visitingState; + while ((visitingState = statesToVisits.poll()) != null) { +// assert multiStateIndex.containsKey(visitingState); + + final int[] curTransitions = new int[points.length]; + final int stateId = multiStateIndex.get(visitingState); + transitionMap.put(stateId, curTransitions); + + for (int c = 0; c < points.length; c++) { + final MultiState destState = visitingState.step(points[c]); + if (destState.isNull()) { + curTransitions[c] = -1; + } else { + Integer destStateId; + synchronized (multiStateIndex) { + destStateId = multiStateIndex.get(destState); + if (destStateId == null) { + destStateId = multiStateIndex.size(); + multiStateIndex.put(destState, destStateId); + statesToVisits.add(destState); + synchronized (lockObject) { + // wake a thread to process destState + lockObject.notify(); + } + } + } + curTransitions[c] = destStateId; + } + } + } + activeThreads.remove(Thread.currentThread()); + // if there are no active threads then we are done + if (activeThreads.isEmpty()) { + synchronized (lockObject) { + // wake waiting threads so they can end + lockObject.notifyAll(); + } + // end this thread + break; + } else { + synchronized (lockObject) { + try { + lockObject.wait(); + } + catch (InterruptedException ignore) { + } + } + activeThreads.add(Thread.currentThread()); + } + } + doneSignal.countDown(); + } + }).start(); + } + try { + // wait for all to finish + doneSignal.await(); + } + catch (InterruptedException e) { + e.printStackTrace(); + } + + assert transitionMap.size() == multiStateIndex.size(); + + final int[] transitions = new int[transitionMap.size() * points.length]; + for (final Map.Entry entry : transitionMap.entrySet()) { + System.arraycopy(entry.getValue(), 0, transitions, entry.getKey() * points.length, points.length); + } + + final int[][] acceptValues = new int[multiStateIndex.size()][]; + for (final Map.Entry entry : multiStateIndex.entrySet()) { + acceptValues[entry.getValue()] = entry.getKey().toAcceptValues(); + } + + return new MultiPatternAutomaton(acceptValues, transitions, points, automata.size()); + } + static MultiPatternAutomaton make(final List automata) { for (final Automaton automaton: automata) { automaton.determinize(); diff --git a/src/main/java/com/fulmicoton/multiregexp/MultiPatternSearcher.java b/src/main/java/com/fulmicoton/multiregexp/MultiPatternSearcher.java index ec98398..f8413e2 100644 --- a/src/main/java/com/fulmicoton/multiregexp/MultiPatternSearcher.java +++ b/src/main/java/com/fulmicoton/multiregexp/MultiPatternSearcher.java @@ -6,12 +6,16 @@ import dk.brics.automaton.StatePair; import dk.brics.automaton.Transition; +import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -public class MultiPatternSearcher { +public class MultiPatternSearcher + implements Serializable { + + private static final long serialVersionUID = -1812442985139693661L; private final MultiPatternAutomaton automaton; private final List individualAutomatons; @@ -19,25 +23,31 @@ public class MultiPatternSearcher { MultiPatternSearcher(final MultiPatternAutomaton automaton, final List individualAutomatons) { + this(automaton, individualAutomatons, true); + } + + MultiPatternSearcher(final MultiPatternAutomaton automaton, + final List individualAutomatons, + boolean tableize) { this.automaton = automaton; - this.individualAutomatons = new ArrayList<>(); - for (final Automaton individualAutomaton: individualAutomatons) { - this.individualAutomatons.add(new RunAutomaton(individualAutomaton)); + this.individualAutomatons = new ArrayList<>(individualAutomatons.size()); + for (final Automaton individualAutomaton : individualAutomatons) { + this.individualAutomatons.add(new RunAutomaton(individualAutomaton, tableize)); } - this.inverseAutomatons = new ArrayList<>(this.individualAutomatons.size()); - for (final Automaton individualAutomaton: individualAutomatons) { + this.inverseAutomatons = new ArrayList<>(individualAutomatons.size()); + for (final Automaton individualAutomaton : individualAutomatons) { final Automaton inverseAutomaton = inverseAutomaton(individualAutomaton); - this.inverseAutomatons.add(new RunAutomaton(inverseAutomaton)); + this.inverseAutomatons.add(new RunAutomaton(inverseAutomaton, tableize)); } } static Automaton inverseAutomaton(final Automaton automaton) { - final Map stateMapping = new HashMap<>(); - for (final State state: automaton.getStates()) { + final Map stateMapping = new HashMap<>(automaton.getStates().size()); + for (final State state : automaton.getStates()) { stateMapping.put(state, new State()); } - for (final State state: automaton.getStates()) { - for (final Transition transition: state.getTransitions()) { + for (final State state : automaton.getStates()) { + for (final Transition transition : state.getTransitions()) { final State invDest = stateMapping.get(state); final State invOrig = stateMapping.get(transition.getDest()); invOrig.addTransition(new Transition(transition.getMin(), transition.getMax(), invDest)); @@ -47,8 +57,8 @@ static Automaton inverseAutomaton(final Automaton automaton) { stateMapping.get(automaton.getInitialState()).setAccept(true); final State initialState = new State(); inverseAutomaton.setInitialState(initialState); - final List epsilons = new ArrayList<>(); - for (final State acceptState: automaton.getAcceptStates()) { + final List epsilons = new ArrayList<>(automaton.getAcceptStates().size()); + for (final State acceptState : automaton.getAcceptStates()) { final State invOrigState = stateMapping.get(acceptState); final StatePair statePair = new StatePair(initialState, invOrigState); epsilons.add(statePair); @@ -67,30 +77,105 @@ public Cursor search(CharSequence s, int position) { public class Cursor { private final CharSequence seq; - private int matchingPattern = -1; - private int end = 0; - private int start = -1; + private int[] matchingPatterns = null; + private int[] matchingPatternsStart = null; + private int[] matchingPatternsEnd = null; + private int currentPosition = 0; Cursor(CharSequence seq, int position) { this.seq = seq; - this.end = position; + this.currentPosition = position; } public int start() { - return this.start; + return start(0); + } + + public int start(int patternIndex) { + if (this.matchingPatterns == null) { + return -1; + } else if (this.matchingPatternsStart == null) { + this.matchingPatternsStart = new int[this.matchingPatterns.length]; + for (int i = 0; i < this.matchingPatterns.length; i++) { + this.matchingPatternsStart[i] = -1; + } + } + + if (this.matchingPatternsStart[patternIndex] == -1) { + // we rewind using the backward automaton to find the start of the pattern. + final RunAutomaton backwardAutomaton = inverseAutomatons.get(this.matchingPatterns[patternIndex]); + int state = backwardAutomaton.getInitialState(); + for (int pos = this.currentPosition - 1; pos >= 0; pos--) { + final char c = this.seq.charAt(pos); + state = backwardAutomaton.step(state, c); + if (state == -1) { + break; + } + if (backwardAutomaton.isAccept(state)) { + this.matchingPatternsStart[patternIndex] = pos; + } + } + } + + return this.matchingPatternsStart[patternIndex]; } public int end() { - return this.end; + return end(0); + } + + public int end(int patternIndex) { + if (this.matchingPatterns == null) { + return -1; + } else if (this.matchingPatternsEnd == null) { + this.matchingPatternsEnd = new int[this.matchingPatterns.length]; + } + + if (this.matchingPatternsEnd[patternIndex] == 0) { + final int seqLength = this.seq.length(); + final int start = start(patternIndex); + // we go forward again using the forward automaton to find the end of the pattern. + final RunAutomaton forwardAutomaton = individualAutomatons.get(this.matchingPatterns[patternIndex]); + int state = forwardAutomaton.getInitialState(); + for (int pos = start; pos < seqLength; pos++) { + final char c = this.seq.charAt(pos); + state = forwardAutomaton.step(state, c); + if (state == -1) { + break; + } + if (forwardAutomaton.isAccept(state)) { + this.matchingPatternsEnd[patternIndex] = pos + 1; + } + } + } + + return this.matchingPatternsEnd[patternIndex]; } public int match() { - return this.matchingPattern; + return match(0); + } + + public int match(int patternIndex) { + return this.matchingPatterns == null ? -1: this.matchingPatterns[patternIndex]; + } + + public String pattern() { + return pattern(0); + } + + public String pattern(int patternIndex) { + final RunAutomaton automaton = individualAutomatons.get(this.matchingPatterns[patternIndex]); + return automaton.toString(); + } + + public int[] matches() { + return this.matchingPatterns; } public boolean found() { - return this.matchingPattern >= 0; + return this.matchingPatterns != null; } @@ -114,58 +199,23 @@ public boolean found() { * If no match is found the function return false. */ public boolean next() { - this.start = -1; - this.matchingPattern = -1; + this.matchingPatterns = null; + this.matchingPatternsStart = null; + this.matchingPatternsEnd = null; final int seqLength = this.seq.length(); - { // first find a match and "choose the pattern". - int state = 0; - for (int pos=this.end; pos < seqLength; pos++) { - final char c = this.seq.charAt(pos); - state = automaton.step(state, c); - if (automaton.atLeastOneAccept[state]) { - // We found a match! - this.matchingPattern = automaton.accept[state][0]; - this.end = pos; - break; - } - } - if (this.matchingPattern == -1) { - return false; - } - } - { // we rewind using the backward automaton to find the start of the pattern. - final RunAutomaton backwardAutomaton = inverseAutomatons.get(this.matchingPattern); - int state = backwardAutomaton.getInitialState(); - for (int pos = this.end; pos >= 0; pos--) { - final char c = this.seq.charAt(pos); - state = backwardAutomaton.step(state, c); - if (state == -1) { - break; - } - if (backwardAutomaton.isAccept(state)) { - start = pos; - } + // first find a match and "choose the pattern". + for (int state = 0, pos = this.currentPosition; pos < seqLength; pos++) { + final char c = this.seq.charAt(pos); + state = automaton.step(state, c); + if (automaton.atLeastOneAccept[state]) { + // We found a match! + this.matchingPatterns = automaton.accept[state]; + this.currentPosition = pos + 1; + break; } } - { // we go forward again using the forward automaton to find the end of the pattern. - final RunAutomaton forwardAutomaton = individualAutomatons.get(this.matchingPattern); - int state = forwardAutomaton.getInitialState(); - for (int pos = this.start; pos < seqLength; pos++) { - final char c = this.seq.charAt(pos); - state = forwardAutomaton.step(state, c); - if (state == -1) { - break; - } - if (forwardAutomaton.isAccept(state)) { - this.end = pos + 1; - } - } - } - - return true; + return this.matchingPatterns != null; } - - } }