-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSConscriptEval
More file actions
83 lines (77 loc) · 3.16 KB
/
Copy pathSConscriptEval
File metadata and controls
83 lines (77 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import os.path
import logging
import random
import subprocess
import shlex
import gzip
import re
import functools
import time
import imp
import sys
import json
import steamroller
# Workaround needed to fix a bug with SCons and the pickle module.
# The three statements below must stay in this order:
#   1. drop the `pickle` entry currently cached in sys.modules,
#   2. locate and load the module afresh through `imp` and register that copy
#      under the same name,
#   3. import it, so the name `pickle` in this file is bound to the fresh copy.
# NOTE(review): `del` raises KeyError if nothing has imported pickle before this
# point, and the `imp` module was removed in Python 3.12 -- confirm which
# Python/SCons versions this file targets and whether the workaround is still
# required there.
del sys.modules['pickle']
sys.modules['pickle'] = imp.load_module('pickle', *imp.find_module('pickle'))
import pickle
# Experiment configuration.
#
# Every variable used by the build has to be declared here with a reasonable
# default value.  To change/override a default, do not edit this list: set the
# value in "custom.py" instead (see that file for an example, which changes the
# number of folds).
#
# NOTE: the name `vars` shadows the Python builtin; it is kept because the
# Environment(...) call in this file refers to it.
vars = Variables("custom.py")

# One (name, help text, default) triple per variable.
_variable_specs = [
    ("OUTPUT_WIDTH", "", 5000),
    # Names of the texts to process; each has its inputs under work/<name>/.
    ("TEXTS", "", ["remus", "dooley", "Biglow", "Todd", "Remus", "Julius", "Huck"]),
    # Passed to scripts/train_ngram.py as --max_n.
    ("MAX_N", "", 4),
    # Generation model and decoding mode; both appear in work/ paths.
    ("GEN_MODEL", "", "mistralai/Mistral-7B-Instruct-v0.2"),
    ("GEN_DECODING", "", "sample"),
    # Models handed to scripts/train_ngram.py as --tokenizer.
    ("EVAL_MODELS", "", ["openai-community/gpt2-large"]),
    # File names looked up under data/.
    ("NG_SCALINGS", "", ["ng_scalings.json"]),
    ("RANDOM_STATE", "", 20),
    # Passed to scripts/reflect_perplexity.py as --chunk_len.
    ("CHUNK_LEN", "", 32),
    # Grid settings -- presumably consumed by the steamroller tool; verify there.
    ("USE_GRID", "", 0),
    ("GRID_TYPE", "", "slurm"),
    #("GRID_QUEUE","", "cpu"),
    #("GRID_ACCOUNT","", ""),
    ("GRID_GPU_COUNT", "", 0),
    ("GRID_MEMORY", "", "64G"),
    ("GRID_LABEL", "", "NGStyle"),
    ("GRID_TIME", "", "48:00:00"),
]
vars.AddVariables(*_variable_specs)
# Shell command template for each pipeline step, keyed by builder name.
# ${SOURCES[i]} / ${TARGETS[i]} refer to the sources and targets given in the
# individual builder call; MAX_N and CHUNK_LEN are declared variables, and
# MODEL is supplied as a per-call override where the builders are invoked.
_builder_commands = {
    "TrainNgram": "python scripts/train_ngram.py --text ${SOURCES[0]} --tokenizer ${MODEL} --output ${TARGETS[0]} --max_n ${MAX_N}",
    "NgramPerplexity": "python scripts/reflect_perplexity.py --input ${SOURCES[0]} --ngram ${SOURCES[1]} --scalings ${SOURCES[2]} --output ${TARGETS[0]} --chunk_len ${CHUNK_LEN}",
    "ProduceElbow": "python scripts/produce_elbow.py --p_in ${SOURCES[0]} --r_in ${SOURCES[1]} --target ${SOURCES[2]} --output ${TARGETS[0]}",
}

# Construction environment shared by every target declared in this file.
# NOTE(review): ENV=os.environ hands SCons the live process-environment mapping
# itself rather than a copy -- confirm that this is intended.
env = Environment(
    variables=vars,
    ENV=os.environ,
    tools=[steamroller.generate],
    BUILDERS={
        builder_name: Builder(action=command)
        for builder_name, command in _builder_commands.items()
    },
)
# Input files, one per entry of TEXTS and in the same order.  None of them is
# produced by a builder in this file.
#   split_texts -- the text itself
#   t_ppls      -- perplexities stored next to the text
#   g_ppls      -- perplexities stored under the generation model / decoding mode
gen_model = env["GEN_MODEL"]
gen_decoding = env["GEN_DECODING"]
split_texts = [env.File(f"work/{name}/text.jsonl") for name in env["TEXTS"]]
t_ppls = [env.File(f"work/{name}/perplexities.jsonl") for name in env["TEXTS"]]
g_ppls = [
    env.File(f"work/{gen_model}/{name}/{gen_decoding}/perplexities.jsonl")
    for name in env["TEXTS"]
]

# For every (evaluation model, scaling file, text) combination declare three
# chained targets: n-gram model -> reflection perplexities -> elbow plot.
# NOTE(review): none of the target paths below contains the scaling file, so
# with more than one NG_SCALINGS entry every scaling maps onto the same
# targets -- confirm that a single scaling file is the only supported setup.
for model in env["EVAL_MODELS"]:
    for scaling_name in env["NG_SCALINGS"]:
        scaling_file = env.File(f"data/{scaling_name}")
        per_text = zip(env["TEXTS"], split_texts, t_ppls, g_ppls)
        for text_name, text_in, t_ppl, g_ppl in per_text:
            # Overrides shared by all three builder calls for this text.
            overrides = {"MODEL": model, "TEXT_NAME": text_name}

            grams = env.TrainNgram(
                ["work/reflections/${MODEL}/${TEXT_NAME}/ngram.pkl"],
                text_in,
                **overrides
            )

            # Sources: text, trained n-gram model, scaling file.
            reflection = env.NgramPerplexity(
                ["work/reflections/${MODEL}/${TEXT_NAME}/reflection.jsonl"],
                [text_in, grams, scaling_file],
                **overrides
            )

            # Source order matches --p_in, --r_in and --target in the command.
            elbow = env.ProduceElbow(
                ["work/reflections/${MODEL}/${TEXT_NAME}/${GEN}_${DECODE}_elbow.png"],
                [g_ppl, reflection, t_ppl],
                GEN=gen_model,
                DECODE=gen_decoding,
                **overrides
            )