-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathdecompile.py
More file actions
125 lines (101 loc) · 3.19 KB
/
Copy pathdecompile.py
File metadata and controls
125 lines (101 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# %%
import argparse
import asyncio
import json
import os
from pathlib import Path
from loguru import logger
import datasets
from datasets import load_from_disk
from declient import DecompilerClient
def _parse_decompilers(value):
    """Split a comma separated list of decompiler names.

    Surrounding whitespace is stripped and blank entries are dropped.
    argparse also passes *string defaults* through ``type``, so the empty
    default has to come out as ``[]``; a plain ``value.split(',')`` would
    turn it into the truthy list ``['']``.
    """
    return [name.strip() for name in value.split(',') if name.strip()]


# Command line interface of the decompilation driver script.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--base-dataset-path', type=str, required=True,
    help="Directory that contains the 'compiled_ds' dataset",
)
parser.add_argument(
    "--output", type=str, required=True,
    help="Directory that receives result.jsonl and the saved result dataset",
)
parser.add_argument(
    "--only-dump-result", action="store_true",
    help="Only dump the result without submitting new tasks",
)
parser.add_argument(
    "--ck_id", type=int,
    default=None,
    help="Zero-based index of the dataset chunk to process (needs --ck_size)",
)
parser.add_argument(
    "--ck_size", type=int,
    default=None,
    help="Number of dataset rows per chunk (needs --ck_id)",
)
parser.add_argument(
    "--decompile-client-host", type=str,
    default="http://localhost:12337",
    help="URL handed to the decompiler client as its target",
)
parser.add_argument(
    "--decompilers",
    type=_parse_decompilers,
    help="Comma separated list of decompilers",
    default="",
)
args = parser.parse_args()

# File handed to the decompiler client as its persistent file; save_result()
# reads it back as JSON.
RESULT = f"{args.output}/result.jsonl"
os.makedirs(args.output, exist_ok=True)

only_dump_result = args.only_dump_result
dataset_path = Path(args.base_dataset_path)
ds = load_from_disk((dataset_path / 'compiled_ds').as_posix())
assert isinstance(ds, datasets.Dataset)  # type narrowing only

# Optional chunking: process only rows [ck_id * ck_size, (ck_id + 1) * ck_size).
# Validation goes through parser.error() rather than `assert`, because asserts
# are stripped when Python runs with -O.
if args.ck_id is not None and args.ck_size is not None:
    if args.ck_id < 0:
        parser.error("--ck_id must be >= 0")
    if args.ck_size <= 0:
        parser.error("--ck_size must be > 0")
    start_idx = args.ck_id * args.ck_size
    if start_idx >= len(ds):
        parser.error(
            f"chunk {args.ck_id} starts at row {start_idx}, "
            f"but the dataset only has {len(ds)} rows"
        )
    # The last chunk may be shorter than ck_size.
    end_idx = min(start_idx + args.ck_size, len(ds))
    ds = ds.select(range(start_idx, end_idx))
    assert isinstance(ds, datasets.Dataset)  # type narrowing only
elif args.ck_id is not None or args.ck_size is not None:
    logger.warning(
        'Only one of --ck_id/--ck_size was given; '
        'chunking is disabled and the whole dataset is processed'
    )

# Must be evaluated before the client is constructed: it records whether the
# result file was already there when the script started (presumably the client
# may create or rewrite that file -- confirm against DecompilerClient).
do_resume = os.path.exists(RESULT)
client = DecompilerClient(
    max_concurrent_requests=50,
    persistent_file_path=RESULT,
    target_url=args.decompile_client_host,
)

# Drop blank names: argparse converts the empty string default of
# --decompilers through its `type` callable, which can yield [''].
DECOMPILERS = [name for name in args.decompilers if name]
logger.info(f'Decompilers: {DECOMPILERS}')
if not DECOMPILERS:
    parser.error("No decompilers specified")
logger.info(f'Number of tasks: {len(ds)}')
async def submit_tasks():
    """Submit one decompile request per (decompiler, dataset row) pair.

    For each name in ``DECOMPILERS`` a request is created for every row of
    ``ds`` (binary path resolved against ``dataset_path``, function address
    passed as a hex string) and all requests of that decompiler are awaited
    together. The row index and decompiler name travel along as the ``idx``
    and ``decompiler`` keyword arguments -- presumably stored as task
    metadata, which is what ``save_result`` reads back. Per-request queue
    saving is switched off; the queue is saved once after all submissions.
    """
    for decompiler_name in DECOMPILERS:
        logger.info(f'Submitting tasks for decompiler: {decompiler_name}')
        pending = []
        for row_idx, record in enumerate(ds):
            request = client.decompile_async(
                (dataset_path / record['path']).as_posix(),
                [hex(record['addr'])],
                decompiler_name,
                save_task_queue=False,
                idx=row_idx,
                decompiler=decompiler_name,
            )
            pending.append(asyncio.create_task(request))
        await asyncio.gather(*pending)
    await client.save_task_queue()
def save_result():
    """Collect completed decompilation outputs and save them as a dataset.

    Loads the JSON task list from ``RESULT`` and builds one column per entry
    of ``DECOMPILERS``, each with ``len(ds)`` slots initialised to ``None``.
    For every task whose status is ``'completed'`` and whose ``result`` is
    non-empty, the first value of the ``result`` mapping is stored at
    ``metadata['idx']`` in the column named by ``metadata['decompiler']``.
    The resulting table is written to ``args.output`` via
    ``Dataset.save_to_disk``.
    """
    with open(RESULT, 'r') as f:
        data = json.load(f)

    result_map = {
        decompiler: [None] * len(ds)
        for decompiler in DECOMPILERS
    }
    for item in data:
        if item.get('status') != 'completed':
            continue
        result = item.get('result')
        # Skip every kind of empty result ('' as before, but also {} or None)
        # instead of crashing while taking the first value below.
        if not result:
            continue
        metadata = item['metadata']
        column = result_map[metadata['decompiler']]
        column[metadata['idx']] = next(iter(result.values()))

    result_ds = datasets.Dataset.from_dict(result_map)
    result_ds.save_to_disk(args.output)
async def main():
    """Run the pipeline: submit tasks, process the queue, dump the results.

    When ``only_dump_result`` is set, nothing is submitted or processed and
    the existing result file is dumped directly. Otherwise tasks are
    submitted unless ``do_resume`` is set (the result file already existed at
    start-up), the client then works through its task queue, and finally the
    results are saved.
    """
    if only_dump_result:
        save_result()
        return

    # A pre-existing result file means tasks were submitted by an earlier run.
    if not do_resume:
        await submit_tasks()

    logger.info('Waiting for tasks to be completed')
    await client.process_task_queue()
    save_result()
# Script entry point: drive main() to completion on a fresh asyncio event loop.
asyncio.run(main())