aho corasick in python

The Aho-Corasick algorithm is useful for pattern matching in strings. It constructs a finite automaton that can efficiently search for multiple patterns simultaneously. Here's how to implement Aho-Corasick in Python:

Trie Construction

main.py
class TrieNode:
    def __init__(self, char):
        self.char = char
        self.children = []
        self.word_finished = False
        self.failure = None

def add_node(root, word):
    curr_node = root
    for char in word:
        found_in_child = False
        for child in curr_node.children:
            if child.char == char:
                curr_node = child
                found_in_child = True
                break
        if not found_in_child:
            new_node = TrieNode(char)
            curr_node.children.append(new_node)
            curr_node = new_node
    curr_node.word_finished = True

def construct_trie(words):
    root = TrieNode('')
    for word in words:
        add_node(root, word)
    return root
737 chars
28 lines

Faliure Links

main.py
def create_faliure_links(root):
    q = []
    for node in root.children:
        q.append(node)
        node.failure = root
    while q:
        curr = q.pop(0)
        for child in curr.children:
            q.append(child)
            fail_node = curr.failure
            while fail_node:
                for fail_child in fail_node.children:
                    if fail_child.char == child.char:
                        child.failure = fail_child
                        break
                if child.failure:
                    break
                fail_node = fail_node.failure
            if not child.failure:
                child.failure = root
658 chars
21 lines

Search Trie

main.py
def search_trie(root, text):
    curr_node = root
    matches = {}
    
    for i,c in enumerate(text):
        while curr_node and not curr_node.children:
            curr_node = curr_node.failure
            
        for child in curr_node.children:
            while child and child.char!=c:
                child = child.failure
            curr_node = child
            if not curr_node:
                curr_node = root
                continue
            
            if curr_node.word_finished: # possible match found
                begin_index = i-len(child)+1
                match_length = len(child)
                match = text[begin_index:begin_index+match_length]
                matches[match] = matches.get(match, 0) + 1
            
            fail_node = curr_node.failure
            while fail_node: # possible multiple matches
                for fail_child in fail_node.children:
                    if fail_child.char == c:
                        curr_node = fail_child
                        break
                if curr_node.word_finished: # possible match found
                    begin_index = i-len(child)+1
                    match_length = len(child)
                    match = text[begin_index:begin_index+match_length]
                    matches[match] = matches.get(match, 0) + 1
                fail_node = fail_node.failure
                
    return matches
1406 chars
37 lines

Here's how to use these functions to search for patterns:

main.py
patterns = ["he", "she", "his", "hers"]
text = "ushers"
root = construct_trie(patterns)
create_faliure_links(root)
matches = search_trie(root, text)
print(matches) # {"he": 1, "she": 1, "hers": 1}
197 chars
7 lines

gistlibby LogSnag