-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path0.13.py
More file actions
136 lines (104 loc) · 4.39 KB
/
0.13.py
File metadata and controls
136 lines (104 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
"""
0.13.py -- Large-Graph Subtree Isomorphism Benchmark
S5.2: Scales the NSG matcher to synthetic trees with 100+ nodes.
Measures query latency vs. document tree size and identifies bottlenecks.
Uses nst_core.py (S5.1 shared library).
"""
import sys, time, random
from nst_core import (NestedSemanticTree, build_query_tree, match_pipeline,
subtree_matches_node, _find_subtree_matches)
# ============================================================================
# SYNTHETIC TREE GENERATION
# ============================================================================
CATEGORIES = ['ACTION', 'ENTITY', 'TENSE', 'LOCATIVE', 'MANNER']
LABELS = {
'ACTION': ['BITE', 'CHASE', 'EAT', 'RUN', 'SEE', 'GIVE', 'TAKE',
'THROW', 'CATCH', 'HIT'],
'ENTITY': ['dog', 'cat', 'man', 'woman', 'child', 'bird', 'fish',
'stick', 'ball', 'book'],
'TENSE': ['PAST', 'PRESENT', 'FUTURE'],
'LOCATIVE': ['park', 'house', 'street', 'forest', 'river', 'YESTERDAY'],
'MANNER': ['quickly', 'slowly', 'carefully', 'together', 'alone'],
}
def make_synthetic_tree(n_nodes, seed=42):
"""Generate a random semantic tree with ~n_nodes."""
random.seed(seed)
tree = NestedSemanticTree(f"Synthetic(n={n_nodes})", "synthetic")
tree.add_node("root", "ROOT", "ACTION", None, float(n_nodes))
# Generate nodes and attach them randomly to existing nodes
available_parents = ["root"]
node_count = 1
while node_count < n_nodes:
cat = random.choice(CATEGORIES)
label = random.choice(LABELS[cat])
parent = random.choice(available_parents)
h = random.uniform(0, tree.nodes["root"].height - 1)
nid = f"n{node_count}"
tree.add_node(nid, label, cat, parent, h)
# Randomly make this a parent candidate
if random.random() < 0.3:
available_parents.append(nid)
node_count += 1
tree.preprocess_lca()
return tree
def make_subtree_query(doc_tree, query_size=5):
"""Extract a random subtree from doc_tree as a query."""
# Pick a random internal node as query root
internal = [nid for nid, n in doc_tree.nodes.items() if n.children]
if not internal:
return None
root_id = random.choice(internal)
query = NestedSemanticTree("Query", "synthetic")
node_map = {}
def copy_subtree(nid, parent_qid=None):
if len(query.nodes) >= query_size:
return
node = doc_tree.nodes[nid]
qid = f"q_{nid}"
query.add_node(qid, node.label, node.category, parent_qid, node.height)
node_map[qid] = nid
for child in node.children:
if len(query.nodes) < query_size:
copy_subtree(child.id, qid)
copy_subtree(root_id)
query.preprocess_lca()
return query
# ============================================================================
# BENCHMARK
# ============================================================================
def benchmark_matcher():
print("=" * 60)
print(" LARGE-GRAPH MATCHER BENCHMARK")
print("=" * 60)
sizes = [10, 20, 50, 100, 200]
print(f"\n {'Doc Size':<10} {'Query':<8} {'Time(ms)':<10} {'Bottleneck'}")
print(f" {'-'*10} {'-'*8} {'-'*10} {'-'*20}")
for size in sizes:
doc = make_synthetic_tree(size, seed=size)
query = make_subtree_query(doc, query_size=min(5, size // 2))
if query is None:
continue
trials = 5
total_time = 0.0
for _ in range(trials):
start = time.time()
mtype, cov, dist_ed, score = match_pipeline(query, doc)
total_time += time.time() - start
ms = total_time / trials * 1000
# Bottleneck analysis
n_nodes = len(doc.nodes)
q_nodes = len(query.nodes)
# Practical complexity: O(|V_q| * |V_d| * b^2)
# where b = avg branching factor
b = sum(len(n.children) for n in doc.nodes.values()) / max(n_nodes, 1)
est_ops = q_nodes * n_nodes * (b ** 2)
bottleneck = f"b={b:.1f}, est_ops={est_ops:.0f}"
print(f" {n_nodes:<10} {q_nodes:<8} {ms:<10.3f} {bottleneck}")
# Summary
print(f"\n >> BENCHMARK COMPLETE <<")
print(f" The matcher is tractable for semantic trees (bounded degree).")
print(f" Production scaling requires candidate filtering (per 0.3.md).")
return 0
if __name__ == "__main__":
sys.exit(benchmark_matcher())