Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
Code repository: https://github.com/rasbt/LLMs-from-scratch
Chapter 4 Exercise solutions#
from importlib.metadata import version
print("torch version:", version("torch"))
Exercise 4.1: Parameters in the feed forward versus attention module#
from gpt import TransformerBlock
GPT_CONFIG_124M = {
"vocab_size": 50257,
"context_length": 1024,
"emb_dim": 768,
"n_heads": 12,
"n_layers": 12,
"drop_rate": 0.1,
"qkv_bias": False
}
block = TransformerBlock(GPT_CONFIG_124M)
print(block)
TransformerBlock(
(att): MultiHeadAttention(
(W_query): Linear(in_features=768, out_features=768, bias=False)
(W_key): Linear(in_features=768, out_features=768, bias=False)
(W_value): Linear(in_features=768, out_features=768, bias=False)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(ff): FeedForward(
(layers): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU()
(2): Linear(in_features=3072, out_features=768, bias=True)
)
)
(norm1): LayerNorm()
(norm2): LayerNorm()
(drop_shortcut): Dropout(p=0.1, inplace=False)
)
total_params = sum(p.numel() for p in block.ff.parameters())
print(f"Total number of parameters in feed forward module: {total_params:,}")
Total number of parameters in feed forward module: 4,722,432
total_params = sum(p.numel() for p in block.att.parameters())
print(f"Total number of parameters in attention module: {total_params:,}")
Total number of parameters in attention module: 2,360,064
- The results above are for a single transformer block.
- Optionally multiply by 12 to capture all transformer blocks in the 124M GPT model, as sketched below.
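For example, here is a minimal sketch (reusing the block and GPT_CONFIG_124M objects defined above) that scales the per-block counts to all transformer blocks:

# Scale the single-block counts to all transformer blocks in the 124M model
n_layers = GPT_CONFIG_124M["n_layers"]  # 12
ff_params = sum(p.numel() for p in block.ff.parameters())
att_params = sum(p.numel() for p in block.att.parameters())
print(f"Feed forward parameters in all {n_layers} blocks: {ff_params * n_layers:,}")
print(f"Attention parameters in all {n_layers} blocks: {att_params * n_layers:,}")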
Bonus: Mathematical breakdown
For those interested in how these parameter counts are calculated mathematically, you can find the breakdown below (assuming emb_dim=768):

Feed forward module:
- 1st Linear layer: 768 inputs × 4×768 outputs + 4×768 bias units = 2,362,368
- 2nd Linear layer: 4×768 inputs × 768 outputs + 768 bias units = 2,360,064
- Total: 1st Linear layer + 2nd Linear layer = 2,362,368 + 2,360,064 = 4,722,432

Attention module:
- W_query: 768 inputs × 768 outputs = 589,824
- W_key: 768 inputs × 768 outputs = 589,824
- W_value: 768 inputs × 768 outputs = 589,824
- out_proj: 768 inputs × 768 outputs + 768 bias units = 590,592
- Total: W_query + W_key + W_value + out_proj = 3×589,824 + 590,592 = 2,360,064
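As a quick sanity check, the same numbers can be reproduced directly from emb_dim (a small sketch of the arithmetic above):

emb_dim = 768

# Feed forward module: two Linear layers with bias and a 4*emb_dim hidden size
ff_first = emb_dim * 4 * emb_dim + 4 * emb_dim       # 2,362,368
ff_second = 4 * emb_dim * emb_dim + emb_dim          # 2,360,064
print("Feed forward total:", ff_first + ff_second)   # 4,722,432

# Attention module: three bias-free projections plus out_proj with bias
qkv = 3 * (emb_dim * emb_dim)                        # 3 x 589,824
out_proj = emb_dim * emb_dim + emb_dim               # 590,592
print("Attention total:", qkv + out_proj)            # 2,360,064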
Exercise 4.2: Initialize larger GPT models#
GPT2-small (the 124M configuration we already implemented):
- "emb_dim" = 768
- "n_layers" = 12
- "n_heads" = 12

GPT2-medium:
- "emb_dim" = 1024
- "n_layers" = 24
- "n_heads" = 16

GPT2-large:
- "emb_dim" = 1280
- "n_layers" = 36
- "n_heads" = 20

GPT2-XL:
- "emb_dim" = 1600
- "n_layers" = 48
- "n_heads" = 25
GPT_CONFIG_124M = {
"vocab_size": 50257,
"context_length": 1024,
"emb_dim": 768,
"n_heads": 12,
"n_layers": 12,
"drop_rate": 0.1,
"qkv_bias": False
}
def get_config(base_config, model_name="gpt2-small"):
GPT_CONFIG = base_config.copy()
if model_name == "gpt2-small":
GPT_CONFIG["emb_dim"] = 768
GPT_CONFIG["n_layers"] = 12
GPT_CONFIG["n_heads"] = 12
elif model_name == "gpt2-medium":
GPT_CONFIG["emb_dim"] = 1024
GPT_CONFIG["n_layers"] = 24
GPT_CONFIG["n_heads"] = 16
elif model_name == "gpt2-large":
GPT_CONFIG["emb_dim"] = 1280
GPT_CONFIG["n_layers"] = 36
GPT_CONFIG["n_heads"] = 20
elif model_name == "gpt2-xl":
GPT_CONFIG["emb_dim"] = 1600
GPT_CONFIG["n_layers"] = 48
GPT_CONFIG["n_heads"] = 25
else:
raise ValueError(f"Incorrect model name {model_name}")
return GPT_CONFIG
def calculate_size(model): # based on chapter code
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")
# Calculate the total size in bytes (assuming float32, 4 bytes per parameter)
total_size_bytes = total_params * 4
# Convert to megabytes
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total size of the model: {total_size_mb:.2f} MB")
from gpt import GPTModel
for model_abbrev in ("small", "medium", "large", "xl"):
model_name = f"gpt2-{model_abbrev}"
CONFIG = get_config(GPT_CONFIG_124M, model_name=model_name)
model = GPTModel(CONFIG)
print(f"\n\n{model_name}:")
calculate_size(model)
gpt2-small:
Total number of parameters: 163,009,536
Number of trainable parameters considering weight tying: 124,412,160
Total size of the model: 621.83 MB
gpt2-medium:
Total number of parameters: 406,212,608
Number of trainable parameters considering weight tying: 354,749,440
Total size of the model: 1549.58 MB
gpt2-large:
Total number of parameters: 838,220,800
Number of trainable parameters considering weight tying: 773,891,840
Total size of the model: 3197.56 MB
gpt2-xl:
Total number of parameters: 1,637,792,000
Number of trainable parameters considering weight tying: 1,557,380,800
Total size of the model: 6247.68 MB
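The "considering weight tying" counts refer to GPT-2's reuse of the token-embedding matrix as the output layer. Applying such tying is not part of the chapter's GPTModel, but as an illustrative sketch it could look like this for the last model constructed in the loop above (gpt2-xl):

# Weight tying sketch: share the token-embedding matrix with the output projection
# (illustration only; the chapter's GPTModel keeps the two layers separate)
model.out_head.weight = model.tok_emb.weight

# nn.Module.parameters() yields shared tensors only once, so the tied model
# reports the smaller count printed for gpt2-xl above (1,557,380,800)
tied_params = sum(p.numel() for p in model.parameters())
print(f"Parameters with weight tying applied: {tied_params:,}")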
Exercise 4.3: Using separate dropout parameters#
GPT_CONFIG_124M = {
"vocab_size": 50257,
"context_length": 1024,
"emb_dim": 768,
"n_heads": 12,
"n_layers": 12,
"drop_rate_emb": 0.1, # NEW: dropout for embedding layers
"drop_rate_attn": 0.1, # NEW: dropout for multi-head attention
"drop_rate_shortcut": 0.1, # NEW: dropout for shortcut connections
"qkv_bias": False
}
import torch
import torch.nn as nn
from gpt import MultiHeadAttention, LayerNorm, FeedForward
class TransformerBlock(nn.Module):
def __init__(self, cfg):
super().__init__()
self.att = MultiHeadAttention(
d_in=cfg["emb_dim"],
d_out=cfg["emb_dim"],
context_length=cfg["context_length"],
num_heads=cfg["n_heads"],
dropout=cfg["drop_rate_attn"], # NEW: dropout for multi-head attention
qkv_bias=cfg["qkv_bias"])
self.ff = FeedForward(cfg)
self.norm1 = LayerNorm(cfg["emb_dim"])
self.norm2 = LayerNorm(cfg["emb_dim"])
self.drop_shortcut = nn.Dropout(cfg["drop_rate_shortcut"])
def forward(self, x):
# Shortcut connection for attention block
shortcut = x
x = self.norm1(x)
x = self.att(x) # Shape [batch_size, num_tokens, emb_size]
x = self.drop_shortcut(x)
x = x + shortcut # Add the original input back
# Shortcut connection for feed-forward block
shortcut = x
x = self.norm2(x)
x = self.ff(x)
x = self.drop_shortcut(x)
x = x + shortcut # Add the original input back
return x
class GPTModel(nn.Module):
def __init__(self, cfg):
super().__init__()
self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
self.drop_emb = nn.Dropout(cfg["drop_rate_emb"]) # NEW: dropout for embedding layers
self.trf_blocks = nn.Sequential(
*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
self.final_norm = LayerNorm(cfg["emb_dim"])
self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)
def forward(self, in_idx):
batch_size, seq_len = in_idx.shape
tok_embeds = self.tok_emb(in_idx)
pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]
x = self.drop_emb(x)
x = self.trf_blocks(x)
x = self.final_norm(x)
logits = self.out_head(x)
return logits
import torch
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
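As a quick check (a small sketch; the dummy batch below is made up for illustration), you can confirm that the three dropout rates ended up in the expected submodules and that a forward pass runs:

# Verify where the separate dropout rates landed
print(model.drop_emb.p)                     # 0.1 (embedding dropout)
print(model.trf_blocks[0].att.dropout.p)    # 0.1 (multi-head attention dropout)
print(model.trf_blocks[0].drop_shortcut.p)  # 0.1 (shortcut dropout)

# Dummy forward pass with a small batch of random token IDs
dummy_batch = torch.randint(0, GPT_CONFIG_124M["vocab_size"], (2, 4))
logits = model(dummy_batch)
print(logits.shape)  # torch.Size([2, 4, 50257])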