ViewLLM: Filtering Attention Heads through Automatable Interpretability Experiments
Amiri Hayes, Jacob Andreas, Belinda Li

— ADDITIONAL INFO —
THREE PROGRAM EXAMPLES:

# Shared imports for the program snippets below; the spaCy model name is assumed.
import numpy as np
import spacy
from sklearn.linear_model import LinearRegression
nlp = spacy.load("en_core_web_sm")

# 1/3. NEXT ATTENTION PROGRAM
# Program Goal: predicts the attention pattern of a head
# where each token attends only to the next token in the sequence.
def next_attention(sentence, tokenizer):
    toks = tokenizer([sentence], return_tensors="pt")
    len_seq = len(toks.input_ids[0])
    out = np.zeros((len_seq, len_seq))
    for i in range(1, len_seq - 1):
        out[i, i + 1] = 1
    # boundary positions: the first token attends to itself,
    # the last token attends back to the first token
    out[0, 0] = 1
    out[-1, 0] = 1
    return "Next Head Attention Pattern", out

# 2/3. DEPENDENCY ATTENTION PROGRAM
# Program Goal: assigns bidirectional uniform attention
# between tokens linked in the sentence's dependency tree.
def dependencies(sentence, tokenizer):
    toks = tokenizer([sentence], return_tensors="pt")
    len_seq = len(toks.input_ids[0])
    out = np.zeros((len_seq, len_seq))
    doc = nlp(" ".join(sentence.split()))
    for stok in doc:
        parent_index = stok.i
        for child_stok in stok.children:
            child_index = child_stok.i
            # +1 offset skips the tokenizer's leading special token; assumes spaCy
            # tokens align one-to-one with tokenizer tokens between the special tokens
            out[parent_index + 1, child_index + 1] = 1
            out[child_index + 1, parent_index + 1] = 1
    out[0, 0] = 1
    out[-1, 0] = 1
    out = out / out.sum(axis=1, keepdims=True)
    return "Dependency Parsing Pattern", out

# 3/3. PUNCTUATION ATTENTION PROGRAM
# Simple Program: assigns uniform attention to the
# upcoming punctuation tokens in a given sentence.
def punctuation_attention(sentence, tokenizer):
    toks = tokenizer([sentence], return_tensors="pt")
    len_seq = len(toks.input_ids[0])
    out = np.zeros((len_seq, len_seq))
    words = tokenizer.convert_ids_to_tokens(toks.input_ids[0])
    punctuation_set = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
    punctuation_indices = [i for i, tok in enumerate(words) if any(p in tok for p in punctuation_set)]
    for i in range(len_seq):
        future_punct = [j for j in punctuation_indices if j > i]
        if future_punct:
            # spread attention uniformly over all upcoming punctuation tokens
            for j in future_punct:
                out[i, j] = 1.0
            out[i] /= out[i].sum()
        else:
            # no punctuation ahead: fall back to self-attention
            out[i, i] = 1.0
    # small smoothing term, then renormalize rows into distributions
    out += 1e-4
    out = out / out.sum(axis=1, keepdims=True)
    return "Punctuation Pattern", out

# EXTRA - LINEARLY FIT ATTENTION EXAMPLE:
# Example of linearly combining programs to better fit
# an observed attention matrix y:
def linear_fit(sentence, tokenizer, patterns, y):
    X = []
    for pattern in patterns:
        X.append(pattern(sentence, tokenizer)[1].flatten())
    X_n = np.array(X).T
    reg = LinearRegression().fit(X_n, y.flatten())
    # rebuild the fitted pattern from the learned coefficients
    out = reg.intercept_ + sum(coef * mat for coef, mat in zip(reg.coef_, X))
    len_seq = len(tokenizer([sentence], return_tensors="pt").input_ids[0])
    out = out.reshape((len_seq, len_seq))
    out = out / out.sum(axis=1, keepdims=True)
    return "Linear Fit", out, reg.intercept_, reg.coef_
FUTURE STEPS
As of 8/1/2025 →
- Automation: Expand on the currently implemented automated (LLM-generated) programs
  - First, manually implement additional programs to seed a program database
  - Then, run the following pipeline to augment the database & improve the LLM-generated programs (a sketch of this cycle appears at the end of this section):
    - Sample attention heads that best_fit & linear_fit do not fit well
    - Generate a hypothesis and program from the data, and validate the hypothesis
    - Add the program to the database and rerun the cycle
- Inference-Time Analyses (Some Questions): Run experiments for the following questions
  - Do heads take on the same function across all sentences & all types of sentences?
  - Do these programs emerge in different models? Are there programs that only exist in bigger LLMs?
  - Do head functions emerge over time during training? If so, when and which ones?
  - If heads can change function, is there a correlation between a head's current function & the other patterns it takes on?
- Structured Evaluation:
  - Explore program / function compositions
    - Determine which programs co-occur in heads & develop a composition tree from the program database (a sketch appears at the end of this section)
  - Expand on the existing Chain-of-Thought (CoT) implementation
    - Continue comparing activations in question-answering contexts when LLMs are given misleading hints
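
As referenced under Automation above, a structural sketch of the augment-and-rerun pipeline; the propose_program helper (standing in for the LLM call), the 0.5 fit threshold, the list-based program database, and the data layout are all hypothetical:

# Hypothetical sketch of the automation cycle; propose_program (the LLM call),
# the fit threshold, and the data layout are assumptions, not the poster's code.
def automation_cycle(model_heads, sentences, tokenizer, program_db, propose_program,
                     threshold=0.5):
    # model_heads: {(layer, head): {sentence: observed (seq, seq) attention matrix}}
    for (layer, head), observed in model_heads.items():
        # 1. sample heads that the current program database does not fit well
        probe = sentences[0]
        _, _, score = best_fit(probe, tokenizer, program_db, observed[probe])
        if score >= threshold:
            continue
        # 2. generate a hypothesis + candidate program (LLM-generated; placeholder here)
        candidate = propose_program(layer, head, observed)
        # 3. validate the hypothesis on the remaining sentences
        scores = [np.corrcoef(candidate(s, tokenizer)[1].flatten(),
                              observed[s].flatten())[0, 1] for s in sentences[1:]]
        # 4. add validated programs to the database; the whole cycle is then rerun
        if np.mean(scores) > threshold:
            program_db.append(candidate)
    return program_db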
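
Likewise for the Structured Evaluation item on co-occurring programs, a minimal sketch; head_to_programs is a hypothetical mapping from each head to the names of programs that fit it well:

# Hypothetical sketch: count how often pairs of programs fit the same head well,
# as raw material for a composition tree over the program database.
from collections import Counter
from itertools import combinations

def program_cooccurrence(head_to_programs):
    # head_to_programs: {(layer, head): set of program names with a good fit}
    counts = Counter()
    for programs in head_to_programs.values():
        for a, b in combinations(sorted(programs), 2):
            counts[(a, b)] += 1
    return counts  # e.g., counts[("Dependency Parsing Pattern", "Punctuation Pattern")]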