Supplementary code for the *Build a Large Language Model From Scratch* book by Sebastian Raschka. Code repository: https://github.com/rasbt/LLMs-from-scratch
Chapter 2 Exercise solutions
Packages that are being used in this notebook:
from importlib.metadata import version
print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))
Exercise 2.1
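This exercise applies the GPT-2 BPE tokenizer from the tiktoken library to the unknown string "Akwirw ier": first encoding it into token IDs, then decoding each ID individually, and finally decoding the full ID list to reconstruct the original input.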
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
integers = tokenizer.encode("Akwirw ier")
print(integers)
[33901, 86, 343, 86, 220, 959]
for i in integers:
    print(f"{i} -> {tokenizer.decode([i])}")
33901 -> Ak
86 -> w
343 -> ir
86 -> w
220 ->
959 -> ier
tokenizer.encode("Ak")
[33901]
tokenizer.encode("w")
[86]
tokenizer.encode("ir")
[343]
tokenizer.encode("w")
[86]
tokenizer.encode(" ")
[220]
tokenizer.encode("ier")
[959]
tokenizer.decode([33901, 86, 343, 86, 220, 959])
'Akwirw ier'
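The same check can also be done programmatically; a minimal sketch, using the tokenizer and integers defined above:
subwords = [tokenizer.decode([i]) for i in integers]

# Concatenating the individually decoded subwords must reproduce the input,
# since BPE splits the text into contiguous pieces
assert "".join(subwords) == "Akwirw ier"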
Exercise 2.2
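This exercise experiments with the data loader using different max_length and stride settings, and prepares token and positional embedding layers with an embedding dimension of 256.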
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
def create_dataloader(txt, batch_size=4, max_length=256, stride=128):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader (shuffle=False keeps the batches in document order,
    # which makes the outputs below deterministic)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    return dataloader
with open("the-verdict.txt", "r", encoding="utf-8") as f:
raw_text = f.read()
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)
vocab_size = 50257
output_dim = 256
max_len = 4
context_length = max_len
# The token embedding layer needs one row per vocabulary entry,
# and the positional embedding layer one row per position in the context window
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
dataloader = create_dataloader(raw_text, batch_size=4, max_length=2, stride=2)
for batch in dataloader:
    x, y = batch
    break
x
tensor([[ 40, 367],
[2885, 1464],
[1807, 3619],
[ 402, 271]])
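Since stride equals max_length in this configuration, consecutive input windows are contiguous and do not overlap.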
dataloader = create_dataloader(raw_text, batch_size=4, max_length=8, stride=2)
for batch in dataloader:
    x, y = batch
    break
x
tensor([[ 40, 367, 2885, 1464, 1807, 3619, 402, 271],
[ 2885, 1464, 1807, 3619, 402, 271, 10899, 2138],
[ 1807, 3619, 402, 271, 10899, 2138, 257, 7026],
[ 402, 271, 10899, 2138, 257, 7026, 15632, 438]])
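With max_length=8 but stride=2, consecutive windows overlap by six tokens, which is visible in the shifted rows above. To complete the pipeline, the embedding layers defined earlier can be applied to a batch; a minimal sketch, assuming the dataloader is recreated with max_length=4 so that it matches context_length:
dataloader = create_dataloader(raw_text, batch_size=4, max_length=max_len, stride=max_len)
x, y = next(iter(dataloader))

token_embeddings = token_embedding_layer(x)                   # shape: (4, 4, 256)
pos_embeddings = pos_embedding_layer(torch.arange(max_len))   # shape: (4, 256)

# Broadcasting adds the same positional vector to every sequence in the batch
input_embeddings = token_embeddings + pos_embeddings          # shape: (4, 4, 256)
print(input_embeddings.shape)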