Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka

Code repository: https://github.com/rasbt/LLMs-from-scratch
  • Install the additional package requirements for this bonus notebook by uncommenting and running the following cell:

# pip install -r requirements-extra.txt

Comparing Various Byte Pair Encoding (BPE) Implementations


 

Using BPE from tiktoken

from importlib.metadata import version

print("tiktoken version:", version("tiktoken"))
import tiktoken

tik_tokenizer = tiktoken.get_encoding("gpt2")

text = "Hello, world. Is this-- a test?"
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
strings = tik_tokenizer.decode(integers)

print(strings)
Hello, world. Is this-- a test?
print(tik_tokenizer.n_vocab)
50257
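
The allowed_special={"<|endoftext|>"} argument used above tells tiktoken to encode the GPT-2 end-of-text marker as a single token id; by default, tiktoken raises an error when it encounters special tokens in the input. A minimal sketch of this behavior (the sample string below is made up for illustration; 50256 is the id of <|endoftext|>, the last entry in the 50,257-token GPT-2 vocabulary):

sample = "Hello, world. <|endoftext|> Is this-- a test?"

# Without allowed_special, tik_tokenizer.encode(sample) would raise an error
# because the input contains a special token
ids = tik_tokenizer.encode(sample, allowed_special={"<|endoftext|>"})
print(ids)  # <|endoftext|> should appear as id 50256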

 

Using the original BPE implementation used in GPT-2

from bpe_openai_gpt2 import get_encoder, download_vocab
download_vocab()
Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s]                                                   
Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s]                                                       
orig_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")
integers = orig_tokenizer.encode(text)

print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
strings = orig_tokenizer.decode(integers)

print(strings)
Hello, world. Is this-- a test?

 

Using BPE via Hugging Face transformers

import transformers

transformers.__version__
'4.49.0'
from transformers import GPT2Tokenizer

hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
hf_tokenizer(strings)["input_ids"]
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
from transformers import GPT2TokenizerFast

hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained("gpt2")
hf_tokenizer_fast(strings)["input_ids"]
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
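
For completeness, both Hugging Face tokenizers can also map the ids back into text via their decode method (a minimal sketch reusing the objects defined above):

ids = hf_tokenizer_fast(strings)["input_ids"]
print(hf_tokenizer_fast.decode(ids))  # should print: Hello, world. Is this-- a test?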

 

Using my own from-scratch BPE tokenizer

import os
import sys
import io
import nbformat
import types

def import_from_notebook():
    def import_definitions_from_notebook(fullname, names):
        current_dir = os.getcwd()
        path = os.path.join(current_dir, "..", "05_bpe-from-scratch", fullname + ".ipynb")
        path = os.path.normpath(path)

        # Load the notebook
        if not os.path.exists(path):
            raise FileNotFoundError(f"Notebook file not found at: {path}")

        with io.open(path, "r", encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        # Create a module to store the imported functions and classes
        mod = types.ModuleType(fullname)
        sys.modules[fullname] = mod

        # Go through the notebook cells and only execute function or class definitions
        for cell in nb.cells:
            if cell.cell_type == "code":
                cell_code = cell.source
                for name in names:
                    # Check for function or class definitions
                    if f"def {name}" in cell_code or f"class {name}" in cell_code:
                        exec(cell_code, mod.__dict__)
        return mod

    fullname = "bpe-from-scratch"
    names = ["BPETokenizerSimple"]

    return import_definitions_from_notebook(fullname, names)
imported_module = import_from_notebook()
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)

tokenizer_gpt2 = BPETokenizerSimple()
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=os.path.join("gpt2_model", "encoder.json"),
    bpe_merges_path=os.path.join("gpt2_model", "vocab.bpe")
)
integers = tokenizer_gpt2.encode(text)

print(integers)
[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]
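
Since all four tokenizers implement the same GPT-2 BPE scheme, they should produce identical token ids for the sample sentence; a quick consistency check (reusing the tokenizer objects created above):

expected = [15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

assert tik_tokenizer.encode(text) == expected
assert orig_tokenizer.encode(text) == expected
assert hf_tokenizer_fast(text)["input_ids"] == expected
assert tokenizer_gpt2.encode(text) == expected
print("All four tokenizers agree on the sample text.")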

 

A quick performance benchmark

with open("../01_main-chapter-code/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
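
The Hugging Face warnings further below report that this short story encodes to 5,145 GPT-2 tokens, so each timing measures a single pass over a few thousand tokens. A quick way to check the input size (a minimal sketch using the tiktoken tokenizer from above):

print("Characters:", len(raw_text))
print("Tokens:", len(tik_tokenizer.encode(raw_text)))  # expected to be around 5,145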

Original OpenAI GPT-2 tokenizer

%timeit orig_tokenizer.encode(raw_text)
3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Tiktoken OpenAI GPT-2 tokenizer

%timeit tik_tokenizer.encode(raw_text)
901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Hugging Face OpenAI GPT-2 tokenizer

%timeit hf_tokenizer(raw_text)["input_ids"]
Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors
11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"]
10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer_fast(raw_text)["input_ids"]
Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors
3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)["input_ids"]
3.77 ms ± 49.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

My own GPT-2 tokenizer (for educational purposes)

%timeit tokenizer_gpt2.encode(raw_text)
9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
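
To summarize the timings above (measured on the author's machine; absolute numbers will vary with hardware and library versions):

Tokenizer                              Time per encode of the-verdict.txt
Original OpenAI GPT-2 implementation   ~3.8 ms
tiktoken                               ~0.9 ms
Hugging Face GPT2Tokenizer (slow)      ~11 ms
Hugging Face GPT2TokenizerFast         ~3.7 ms
From-scratch BPETokenizerSimple        ~9.4 ms

In this comparison, tiktoken is the fastest option by roughly a factor of four, the Hugging Face fast tokenizer is on par with the original OpenAI implementation, and the from-scratch tokenizer trails them, which is expected since it is written for educational clarity rather than speed.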