ViewLLM
Filtering Attention Heads through Automatable Interpretability Experiments
Amiri Hayes, Jacob Andreas, Belinda Li

— ADDITIONAL INFO —

THREE PROGRAM EXAMPLES:

# Shared import for the example programs below:
import numpy as np

# 1/3. NEXT ATTENTION PROGRAM
# Program Goal: predicts the activations of a head
# where each token only looks at the next token in the sequence.
def next_attention(sentence, tokenizer):
    toks = tokenizer([sentence], return_tensors="pt")
    len_seq = len(toks.input_ids[0])
    out = np.zeros((len_seq, len_seq))
    for i in range(1, len_seq - 1):
        out[i, i + 1] = 1
    out[0, 0] = 1
    out[-1, 0] = 1
    return "Next Head Attention Pattern", out
# 2/3. DEPENDENCY ATTENTION PROGRAM
# Program Goal: assigns bidirectional uniform
# attention to tokens based on the sentence's dependency tree.
def dependencies(sentence, tokenizer):
    toks = tokenizer([sentence], return_tensors="pt")
    len_seq = len(toks.input_ids[0])
    out = np.zeros((len_seq, len_seq))
    doc = nlp(" ".join(sentence.split()))
    # Note: indices are offset by 1, which assumes the tokenizer prepends one
    # special token and otherwise aligns one-to-one with the parser's tokens.
    for stok in doc:
        parent_index = stok.i
        for child_stok in stok.children:
            child_index = child_stok.i
            out[parent_index+1, child_index+1] = 1
            out[child_index+1, parent_index+1] = 1
    out[0, 0] = 1
    out[-1, 0] = 1
    out = out / out.sum(axis=1, keepdims=True)
    return "Dependency Parsing Pattern", out
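The program above calls a global `nlp` dependency parser that is not defined in the snippet; a minimal setup sketch, assuming spaCy (the en_core_web_sm model is an illustrative choice):

# Assumed setup for the `nlp` parser used above (the specific spaCy model is illustrative):
import spacy
nlp = spacy.load("en_core_web_sm")  # supplies the .i and .children attributes used above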
# 3/3. PUNCTUATION ATTENTION PROGRAM
# Simple Program: assigns uniform attention to the
# upcoming punctuation tokens in a given sentence.
def punctuation_attention(sentence, tokenizer):
    toks = tokenizer([sentence], return_tensors="pt")
    len_seq = len(toks.input_ids[0])
    out = np.zeros((len_seq, len_seq))
    words = tokenizer.convert_ids_to_tokens(toks.input_ids[0])
    punctuation_set = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
    punctuation_indices = [i for i, tok in enumerate(words) if any(p in tok for p in punctuation_set)]
    for i in range(len_seq):
        future_punct = [j for j in punctuation_indices if j > i]
        if future_punct:
            for j in future_punct:
                out[i, j] = 1.0
            out[i] /= out[i].sum()
        else:
            out[i, i] = 1.0
    out += 1e-4
    out = out / out.sum(axis=1, keepdims=True)
    return "Punctuation Pattern", out
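To connect program predictions to real activations, here is a sketch of pulling one head's attention matrix from a model and scoring a program against it. The "gpt2" model, the layer/head indices, and Pearson correlation as the fit score are all illustrative assumptions, not the poster's stated method; it reuses the `tokenizer` defined in the earlier sketch.

# Sketch: extract one head's attention and score a program's prediction against it.
# Assumptions: "gpt2" as the model, arbitrary layer/head, Pearson correlation as the score.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("gpt2", output_attentions=True)

def get_head_attention(sentence, layer, head):
    toks = tokenizer([sentence], return_tensors="pt")
    with torch.no_grad():
        attentions = model(**toks).attentions       # tuple of (1, n_heads, seq, seq), one per layer
    return attentions[layer][0, head].numpy()       # observed (seq, seq) pattern for a single head

sentence = "The quick brown fox jumps over the lazy dog."
observed = get_head_attention(sentence, layer=2, head=3)
_, predicted = next_attention(sentence, tokenizer)
fit = np.corrcoef(predicted.flatten(), observed.flatten())[0, 1]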
# EXTRA - LINEARLY FIT ATTENTION EXAMPLE:
# Example of linearly combining programs for a better fit:
from sklearn.linear_model import LinearRegression

def linear_fit(sentence, tokenizer, patterns, y):
    X = []
    for pattern in patterns:
        X.append(pattern(sentence, tokenizer)[1].flatten())
    X_n = np.array(X).T
    reg = LinearRegression().fit(X_n, y.flatten())
    out = reg.intercept_ + sum(coef * mat for coef, mat in zip(reg.coef_, X))
    len_seq = len(tokenizer([sentence], return_tensors="pt").input_ids[0])
    out = out.reshape((len_seq, len_seq))
    out = out / out.sum(axis=1, keepdims=True)
    return "Linear Fit", out, reg.intercept_, reg.coef_
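A usage sketch for linear_fit, reusing the illustrative `tokenizer`, `sentence`, and `get_head_attention` helpers from the sketches above; the target layer/head is arbitrary:

# Usage sketch: approximate one head's observed attention as a linear mix of the program patterns.
patterns = [next_attention, dependencies, punctuation_attention]
y = get_head_attention(sentence, layer=2, head=3)    # observed (len_seq, len_seq) attention matrix
name, fitted, intercept, coefs = linear_fit(sentence, tokenizer, patterns, y)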
FUTURE STEPS
As of 8/1/2025 →
- Automation:
  Expand on the currently implemented automated (LLM-generated) programs
  - First, manually implement additional programs to start a program database
  - Then, run the following pipeline to augment the database & improve the LLM-generated programs (a code sketch follows this list):
    - Sample attention heads that best_fit & linear_fit do not fit well
    - Generate a hypothesis and program from the data, and validate the hypothesis
    - Add the program to the database and rerun the cycle
- Inference-Time Analyses (Some Questions):
  Run experiments for the following questions:
  - Do heads take on the same function across all sentences & all types of sentences?
  - Do these programs emerge in different models? Are there programs which only exist in bigger LLMs?
  - Do head functions emerge over time during training? If so, when and which ones?
  - If heads can change function, is there a correlation between a head's current function & the other patterns it takes on?
- Structured Evaluation:
  Explore program / function compositions
  - Determine which programs co-occur in heads & develop a composition tree from the program database
  Expand on the existing Chain-of-Thought (CoT) implementation
  - Continue comparing activations in Question-Answering contexts when LLMs are given misleading hints
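A minimal sketch of the automation cycle described under "Automation" above; the helper names (fit_score, generate_program_with_llm, validate_program) and the threshold are hypothetical placeholders, not existing functions in the project:

# Hypothetical sketch of the automation cycle (all helper names are placeholders):
def automation_cycle(heads, program_db, sentences, threshold=0.5):
    # 1. Sample heads that the current program database explains poorly
    poorly_fit = [h for h in heads
                  if max(fit_score(p, h, sentences) for p in program_db) < threshold]
    for head in poorly_fit:
        # 2. Generate a hypothesis + candidate program from the head's attention data (LLM step)
        candidate = generate_program_with_llm(head, sentences)
        # 3. Validate the hypothesis before accepting the program
        if validate_program(candidate, head, sentences):
            # 4. Add the program to the database; the cycle is then rerun with the larger database
            program_db.append(candidate)
    return program_db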