Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka. Code repository: https://github.com/rasbt/LLMs-from-scratch
Install the additional package requirements for this bonus notebook by uncommenting and running the following cell:
# pip install -r requirements-extra.txt
Comparing Various Byte Pair Encoding (BPE) Implementations#
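This bonus notebook compares four implementations of the GPT-2 BPE tokenizer: the tiktoken library from OpenAI, the original encoder code released with GPT-2, the slow and fast tokenizers from Hugging Face transformers, and the educational from-scratch implementation from the 05_bpe-from-scratch notebook. A quick performance benchmark at the end compares their encoding speed.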
Using BPE from tiktoken#
from importlib.metadata import version
print("tiktoken version:", version("tiktoken"))
import tiktoken
tik_tokenizer = tiktoken.get_encoding("gpt2")
text = "Hello, world. Is this-- a test?"
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
strings = tik_tokenizer.decode(integers)
print(strings)
Hello, world. Is this-- a test?
print(tik_tokenizer.n_vocab)
50257
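The 50,257-token vocabulary includes the <|endoftext|> special token as its very last ID. A quick sketch to confirm this, using the tokenizer object defined above (the expected values in the comments follow from the GPT-2 vocabulary layout):
# <|endoftext|> must be allowed explicitly; it occupies the final ID
eot_id = tik_tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})
print(eot_id)  # [50256]
print(tik_tokenizer.n_vocab - 1)  # 50256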
Using the original BPE implementation used in GPT-2#
from bpe_openai_gpt2 import get_encoder, download_vocab
download_vocab()
Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s]
Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s]
orig_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")
integers = orig_tokenizer.encode(text)
print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
strings = orig_tokenizer.decode(integers)
print(strings)
Hello, world. Is this-- a test?
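Since both tokenizers implement the same GPT-2 BPE scheme, their outputs should be identical; a minimal sanity check using the objects defined above:
# The original implementation and tiktoken should agree in both directions
assert orig_tokenizer.encode(text) == tik_tokenizer.encode(text)
assert orig_tokenizer.decode(integers) == tik_tokenizer.decode(integers)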
Using BPE via Hugging Face transformers#
import transformers
transformers.__version__
'4.49.0'
from transformers import GPT2Tokenizer
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
hf_tokenizer(strings)["input_ids"]
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
from transformers import GPT2TokenizerFast
hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained("gpt2")
hf_tokenizer_fast(strings)["input_ids"]
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
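Both Hugging Face tokenizers can also decode the IDs back into text; a quick roundtrip check:
# Decoding the IDs should recover the original input string
ids = hf_tokenizer_fast(strings)["input_ids"]
print(hf_tokenizer_fast.decode(ids))  # Hello, world. Is this-- a test?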
Using my own from-scratch BPE tokenizer#
import os
import sys
import io
import nbformat
import types


def import_from_notebook():
    def import_definitions_from_notebook(fullname, names):
        current_dir = os.getcwd()
        path = os.path.join(current_dir, "..", "05_bpe-from-scratch", fullname + ".ipynb")
        path = os.path.normpath(path)

        # Load the notebook
        if not os.path.exists(path):
            raise FileNotFoundError(f"Notebook file not found at: {path}")

        with io.open(path, "r", encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        # Create a module to store the imported functions and classes
        mod = types.ModuleType(fullname)
        sys.modules[fullname] = mod

        # Go through the notebook cells and only execute function or class definitions
        for cell in nb.cells:
            if cell.cell_type == "code":
                cell_code = cell.source
                for name in names:
                    # Check for function or class definitions
                    if f"def {name}" in cell_code or f"class {name}" in cell_code:
                        exec(cell_code, mod.__dict__)

        return mod

    fullname = "bpe-from-scratch"
    names = ["BPETokenizerSimple"]

    return import_definitions_from_notebook(fullname, names)
imported_module = import_from_notebook()
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
tokenizer_gpt2 = BPETokenizerSimple()
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=os.path.join("gpt2_model", "encoder.json"),
    bpe_merges_path=os.path.join("gpt2_model", "vocab.bpe")
)
integers = tokenizer_gpt2.encode(text)
print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
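For completeness, the from-scratch tokenizer can decode the IDs back into text as well (assuming BPETokenizerSimple from the 05_bpe-from-scratch notebook provides a decode method, as defined there):
# Roundtrip check: decoding should recover the original input
print(tokenizer_gpt2.decode(integers))  # Hello, world. Is this-- a test?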
A quick performance benchmark#
with open("../01_main-chapter-code/the-verdict.txt", "r", encoding="utf-8") as f:
raw_text = f.read()
Original OpenAI GPT-2 tokenizer#
%timeit orig_tokenizer.encode(raw_text)
3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Tiktoken OpenAI GPT-2 tokenizer#
%timeit tik_tokenizer.encode(raw_text)
901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Hugging Face OpenAI GPT-2 tokenizer#
%timeit hf_tokenizer(raw_text)["input_ids"]
Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors
11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"]
10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer_fast(raw_text)["input_ids"]
Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors
3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)["input_ids"]
3.77 ms ± 49.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
My own GPT-2 tokenizer (for educational purposes)#
%timeit tokenizer_gpt2.encode(raw_text)
9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
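The %timeit results above can also be gathered in one pass with the timeit module; a minimal sketch, assuming all tokenizer objects defined earlier are still in scope:
import timeit

benchmarks = {
    "orig_tokenizer": lambda: orig_tokenizer.encode(raw_text),
    "tik_tokenizer": lambda: tik_tokenizer.encode(raw_text),
    "hf_tokenizer": lambda: hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"],
    "hf_tokenizer_fast": lambda: hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)["input_ids"],
    "tokenizer_gpt2": lambda: tokenizer_gpt2.encode(raw_text),
}

for name, fn in benchmarks.items():
    seconds = timeit.timeit(fn, number=10) / 10  # average over 10 runs
    print(f"{name}: {seconds * 1e3:.2f} ms per encode")
On the numbers above, tiktoken (~0.9 ms) is roughly 4x faster than both the original OpenAI implementation (~3.8 ms) and the Hugging Face fast tokenizer (~3.7 ms), while the slow Hugging Face tokenizer (~11 ms) and the educational from-scratch implementation (~9.4 ms) trail behind.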