Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka

Code repository: https://github.com/rasbt/LLMs-from-scratch
  • Install the additional package requirements for this bonus notebook by uncommenting and running the following cell:

# pip install -r requirements-extra.txt

Comparing Various Byte Pair Encoding (BPE) Implementations


 

Using BPE from tiktoken

from importlib.metadata import version

print("tiktoken version:", version("tiktoken"))
import tiktoken

tik_tokenizer = tiktoken.get_encoding("gpt2")

text = "Hello, world. Is this-- a test?"
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
strings = tik_tokenizer.decode(integers)

print(strings)
Hello, world. Is this-- a test?
print(tik_tokenizer.n_vocab)
50257
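
The allowed_special={"<|endoftext|>"} argument used above tells tiktoken to encode the GPT-2 end-of-text marker as a single token id; by default, tiktoken raises an error when it encounters special tokens in the input. A minimal sketch of this behavior (the sample string below is made up for illustration; 50256 is the id of <|endoftext|>, the last entry in the 50,257-token GPT-2 vocabulary):

sample = "Hello, world. <|endoftext|> Is this-- a test?"

# Without allowed_special, tik_tokenizer.encode(sample) would raise an error
# because the input contains a special token
ids = tik_tokenizer.encode(sample, allowed_special={"<|endoftext|>"})
print(ids)  # <|endoftext|> should appear as id 50256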

 

Using the original BPE implementation used in GPT-2

from bpe_openai_gpt2 import get_encoder, download_vocab
download_vocab()
Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s]                                                   
Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s]                                                       
orig_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")
integers = orig_tokenizer.encode(text)

print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
strings = orig_tokenizer.decode(integers)

print(strings)
Hello, world. Is this-- a test?

 

Using BPE via Hugging Face transformers

import transformers

transformers.__version__
'4.49.0'
from transformers import GPT2Tokenizer

hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
hf_tokenizer(strings)["input_ids"]
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
from transformers import GPT2TokenizerFast

hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained("gpt2")
hf_tokenizer_fast(strings)["input_ids"]
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
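
For completeness, both Hugging Face tokenizers can also map the ids back into text via their decode method (a minimal sketch reusing the objects defined above):

ids = hf_tokenizer_fast(strings)["input_ids"]
print(hf_tokenizer_fast.decode(ids))  # should print: Hello, world. Is this-- a test?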

 

Using my own from-scratch BPE tokenizer

import os
import sys
import io
import nbformat
import types

def import_from_notebook():
    def import_definitions_from_notebook(fullname, names):
        current_dir = os.getcwd()
        path = os.path.join(current_dir, "..", "05_bpe-from-scratch", fullname + ".ipynb")
        path = os.path.normpath(path)

        # Load the notebook
        if not os.path.exists(path):
            raise FileNotFoundError(f"Notebook file not found at: {path}")

        with io.open(path, "r", encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        # Create a module to store the imported functions and classes
        mod = types.ModuleType(fullname)
        sys.modules[fullname] = mod

        # Go through the notebook cells and only execute function or class definitions
        for cell in nb.cells:
            if cell.cell_type == "code":
                cell_code = cell.source
                for name in names:
                    # Check for function or class definitions
                    if f"def {name}" in cell_code or f"class {name}" in cell_code:
                        exec(cell_code, mod.__dict__)
        return mod

    fullname = "bpe-from-scratch"
    names = ["BPETokenizerSimple"]

    return import_definitions_from_notebook(fullname, names)
imported_module = import_from_notebook()
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)

tokenizer_gpt2 = BPETokenizerSimple()
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=os.path.join("gpt2_model", "encoder.json"),
    bpe_merges_path=os.path.join("gpt2_model", "vocab.bpe")
)
integers = tokenizer_gpt2.encode(text)

print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
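
Since all four tokenizers implement the same GPT-2 BPE scheme, they should produce identical token ids for the sample sentence; a quick consistency check (reusing the tokenizer objects created above):

expected = [15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

assert tik_tokenizer.encode(text) == expected
assert orig_tokenizer.encode(text) == expected
assert hf_tokenizer_fast(text)["input_ids"] == expected
assert tokenizer_gpt2.encode(text) == expected
print("All four tokenizers agree on the sample text.")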

 

A quick performance benchmark

with open("../01_main-chapter-code/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
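
The Hugging Face warnings further below report that this short story encodes to 5,145 GPT-2 tokens, so each timing measures a single pass over a few thousand tokens. A quick way to check the input size (a minimal sketch using the tiktoken tokenizer from above):

print("Characters:", len(raw_text))
print("Tokens:", len(tik_tokenizer.encode(raw_text)))  # expected to be around 5,145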

Original OpenAI GPT-2 tokenizer

%timeit orig_tokenizer.encode(raw_text)
3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Tiktoken OpenAI GPT-2 tokenizer

%timeit tik_tokenizer.encode(raw_text)
901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Hugging Face OpenAI GPT-2 tokenizer

%timeit hf_tokenizer(raw_text)["input_ids"]
Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors
11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"]
10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer_fast(raw_text)["input_ids"]
Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors
3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)["input_ids"]
3.77 ms ± 49.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

My own GPT-2 tokenizer (for educational purposes)

%timeit tokenizer_gpt2.encode(raw_text)
9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
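
To summarize the timings above (measured on the author's machine; absolute numbers will vary with hardware and library versions):

Tokenizer                              Time per encode of the-verdict.txt
Original OpenAI GPT-2 implementation   ~3.8 ms
tiktoken                               ~0.9 ms
Hugging Face GPT2Tokenizer (slow)      ~11 ms
Hugging Face GPT2TokenizerFast         ~3.7 ms
From-scratch BPETokenizerSimple        ~9.4 ms

In this comparison, tiktoken is the fastest option by roughly a factor of four, the Hugging Face fast tokenizer is on par with the original OpenAI implementation, and the from-scratch tokenizer trails them, which is expected since it is written for educational clarity rather than speed.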