Skip to content

Commit

Permalink
tokens should be sampled without replacement when creating autoencoder training data; implemented this in prepare.py
Browse files Browse the repository at this point in the history
  • Loading branch information
shehper committed Apr 4, 2024
1 parent c1677d4 commit 8b14c2e
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion autoencoder/prepare.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
Prepares training dataset for our autoencoder.

Run on Macbook as
    python -u prepare.py --total_contexts=5000 --tokens_per_context=16 --dataset=shakespeare_char --gpt_ckpt_dir=out_sc_1_2_32
"""
# NOTE: original opened the docstring with four quotes (""""), leaving a stray
# leading quote character inside the docstring text; fixed to the standard """.
import os
import torch
import time
Expand Down Expand Up @@ -48,7 +53,7 @@ def compute_activations():
gpt_model.clear_mlp_activation_hooks()

# Process and store activations
token_locs = torch.randint(block_size, (gpt_batch_size, tokens_per_context))
token_locs = torch.stack([torch.randperm(block_size)[:tokens_per_context] for _ in range(gpt_batch_size)])
data = torch.gather(activations, 1, token_locs.unsqueeze(2).expand(-1, -1, activations.size(2))).view(-1, n_ffwd)
data_storage[shuffled_indices[batch * gpt_batch_size * tokens_per_context: (batch + 1) * gpt_batch_size * tokens_per_context]] = data

Expand Down

0 comments on commit 8b14c2e

Please sign in to comment.