aho-corasick algorithm in python

Here is a Python implementation of Aho-Corasick algorithm for string matching:

main.py
class TrieNode:
    def __init__(self, char):
        self.char = char
        self.children = []
        self.word_finished = False
        self.output = None  # output function value of the node
        self.failure = None  # failure function value of the node

def build_trie(keywords):
    root = TrieNode('')
    for keyword in keywords:
        node = root
        for char in keyword:
            found_in_child = False
            for child in node.children:
                if child.char == char:
                    node = child
                    found_in_child = True
                    break
            if not found_in_child:
                new_node = TrieNode(char)
                node.children.append(new_node)
                node = new_node
        node.word_finished = True
    return root

def build_failure(root):
    queue = []
    for child in root.children:
        queue.append(child)
        child.failure = root
    while queue:
        curr_node = queue.pop(0)
        for child in curr_node.children:
            queue.append(child)
            failure_node = curr_node.failure
            while failure_node:
                if child.char in failure_node.children:
                    child.failure = failure_node.children[failure_node.children.index(child.char)]
                    if child.failure.word_finished:
                        child.output = child.failure
                        break
                failure_node = failure_node.failure

def search_strings(text, root):
    curr_node = root
    key_indices = []
    for i in range(len(text)):
        char = text[i]
        while curr_node and char not in curr_node.children:
            curr_node = curr_node.failure
        if not curr_node:
            curr_node = root
            continue
        curr_node = curr_node.children[curr_node.children.index(char)]
        if curr_node.word_finished:
            match = curr_node
            while match:
                key_indices.append((i - len(match) + 1, match))
                match = match.output
    return key_indices
2078 chars
62 lines

To use the Aho-Corasick algorithm for string matching with given keywords, you can simply call the build_trie, build_failure, and search_strings functions as follows:

main.py
keywords = ['he', 'she', 'his', 'hers']
text = 'ushers'
root = build_trie(keywords)
build_failure(root)
key_indices = search_strings(text, root)
print(key_indices)  # [(2, <__main__.TrieNode object at 0x...>), (3, <__main__.TrieNode object at 0x...>), (3, <__main__.TrieNode object at 0x...>), (4, <__main__.TrieNode object at 0x...>)]
336 chars
7 lines

In the example code above, the keywords list contains the target words (i.e. "he", "she", "his", and "hers") and the text string represents the text to search for those words. The search_strings function returns a list of tuples, where each tuple contains the starting index of a matched keyword and the corresponding node object in the trie.

gistlibby LogSnag