Code for reproducing the experiments in the paper "A Mechanistic Account of Attention Sinks in GPT-2: One Circuit, Broader Implications for Mitigation".
conda create -n sinks python=3.11 -y
conda activate sinks
pip install -r requirements.txt

python experiments_statistical.py --mode bias-term --output-dir results
Outputs:
results/bias_term_statistical/bq_k_aggregate_plot_truncated.png → Fig 1
results/bias_term_statistical/bq_k_aggregate_plot.png → Fig 6 (appendix, full histogram)
python experiments_single_input.py --mode epe-bias-proj --output-dir results
Output:
results/epe_bias_proj/epe_alignment.png → Fig 2
python experiments_statistical.py --mode epe-validation --output-dir results
Outputs:
results/epe_validation_statistical/epe_validation_plot.png → Fig 3
results/epe_validation_statistical/epe_validation_precentiles (numerical values for experiments)
python experiments_statistical.py --mode coord-alignment --output-dir results
Outputs:
results/coord_alignment_statistical/coord_alignment_histogram_truncated.png → Fig 4
results/coord_alignment_statistical/coord_alignment_histogram.png → Fig 8 (appendix, full histogram)
python intervention_analysis.py --mode sentence --output-dir results
Outputs:
results/sentence_analysis/layer_04_avg.png through layer_11_avg.png → Fig 5 (layers 4–11)
python intervention_analysis.py --mode dataset --output-dir results
Outputs:
results/dataset_analysis/bos_attention_summary_mid_layers.txt → Table 1
results/dataset_analysis/bos_attention_summary_mid_layers.csv
python experiments_single_input.py --mode massive-activations --output-dir results
Output:
results/massive_activations/massive_activations_in_ppe.png → Fig 7