36 lines
1.1 KiB
Python
36 lines
1.1 KiB
Python
|
|
"""2T-INF — Build SOA from 2-grams (Algorithm 1, TODS 2010)."""
|
||
|
|
|
||
|
|
from .soa import SOA
|
||
|
|
|
||
|
|
|
||
|
|
def build_soa(sequences):
|
||
|
|
"""
|
||
|
|
|———— Algorithm 1: 2T-INF ————|
|
||
|
|
Input: finite set of sample strings S
|
||
|
|
Output: SOA G such that S ⊆ L(G)
|
||
|
|
|
||
|
|
For each string a₁...aₙ in S:
|
||
|
|
add edges (src, a₁), (a₁, a₂), ..., (aₙ, sink)
|
||
|
|
"""
|
||
|
|
G = SOA()
|
||
|
|
symbol_states = {}
|
||
|
|
|
||
|
|
for seq in sequences:
|
||
|
|
if not seq:
|
||
|
|
if not G.has_edge(G.src, G.sink):
|
||
|
|
G.add_edge(G.src, G.sink)
|
||
|
|
continue
|
||
|
|
for i, token in enumerate(seq):
|
||
|
|
if token not in symbol_states:
|
||
|
|
symbol_states[token] = G.add_state(token)
|
||
|
|
if i == 0:
|
||
|
|
G.add_edge(G.src, symbol_states[token])
|
||
|
|
if i == len(seq) - 1:
|
||
|
|
G.add_edge(symbol_states[token], G.sink)
|
||
|
|
if i + 1 < len(seq):
|
||
|
|
nxt = seq[i + 1]
|
||
|
|
if nxt not in symbol_states:
|
||
|
|
symbol_states[nxt] = G.add_state(nxt)
|
||
|
|
G.add_edge(symbol_states[token], symbol_states[nxt])
|
||
|
|
return G
|