Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A draft implementation of BPE tokenizer #39

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 87 additions & 2 deletions mlx/data/core/Tokenizer.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
// Copyright © 2023 Apple Inc.

#include <algorithm>
#include <deque>
#include <limits>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
Expand Down Expand Up @@ -118,8 +122,14 @@ std::shared_ptr<Graph<int64_t>> tokenize(
Tokenizer::Tokenizer(
std::shared_ptr<const Trie<char>> trie,
bool ignore_unk,
const std::vector<double>& trie_key_scores)
: trie_(trie), ignoreUnk_(ignore_unk), trieKeyScores_(trie_key_scores) {
const std::vector<double>& trie_key_scores,
const std::map<std::pair<int64_t, int64_t>, int64_t>& bigram_merges,
const std::map<std::pair<int64_t, int64_t>, int64_t>& bigram_ranks)
: trie_(trie),
ignoreUnk_(ignore_unk),
trieKeyScores_(trie_key_scores),
bigram_merges_(bigram_merges),
bigram_ranks_(bigram_ranks) {
if (!trie_key_scores.empty() &&
(trie_key_scores.size() != trie->num_keys())) {
throw std::runtime_error(
Expand Down Expand Up @@ -184,6 +194,81 @@ std::vector<int64_t> Tokenizer::tokenize_rand(const std::string& input) const {
return tokens;
}

// Tokenize a single lexeme with a naive byte-pair-encoding merge loop.
//
// The lexeme is first split into initial tokens with tokenize_shortest().
// Then, repeatedly, the adjacent token pair with the smallest value in
// bigram_ranks_ is selected and every non-overlapping occurrence of it
// (scanning left to right) is replaced by the token stored for that pair in
// bigram_merges_. The loop stops when fewer than two tokens remain or when no
// adjacent pair has an entry in bigram_ranks_.
//
// Args:
//   input: The lexeme to tokenize.
//
// Returns:
//   The token ids after all applicable merges.
//
// Throws:
//   std::out_of_range if a pair is present in bigram_ranks_ but missing from
//   bigram_merges_.
std::vector<int64_t> Tokenizer::tokenize_bpe_single(
    const std::string& input) const {
  auto unigrams = tokenize_shortest(input);

  // With zero or one token there is no adjacent pair to merge. Checking the
  // size up front also avoids the unsigned wrap-around of `size() - 1` on an
  // empty token sequence.
  while (unigrams.size() >= 2) {
    // Find the adjacent pair with the smallest rank (i.e. the highest merge
    // priority). Ties on rank are broken in favour of the lexicographically
    // smallest pair so the choice does not depend on position in the input.
    auto best = bigram_ranks_.end();
    for (std::size_t i = 0; i + 1 < unigrams.size(); ++i) {
      const auto candidate =
          bigram_ranks_.find(std::make_pair(unigrams[i], unigrams[i + 1]));
      if (candidate == bigram_ranks_.end()) {
        continue;
      }
      if (best == bigram_ranks_.end() || candidate->second < best->second ||
          (candidate->second == best->second &&
           candidate->first < best->first)) {
        best = candidate;
      }
    }

    // If no adjacent pair is ranked, we are done.
    if (best == bigram_ranks_.end()) {
      break;
    }

    const auto bigram_to_merge = best->first;
    const auto merged_bigram = bigram_merges_.at(bigram_to_merge);

    // Rebuild the token sequence, replacing each non-overlapping occurrence
    // of the selected pair by the merged token.
    std::vector<int64_t> next_unigrams;
    next_unigrams.reserve(unigrams.size() - 1);
    std::size_t i = 0;
    while (i < unigrams.size()) {
      const bool is_match = i + 1 < unigrams.size() &&
          unigrams[i] == bigram_to_merge.first &&
          unigrams[i + 1] == bigram_to_merge.second;
      if (is_match) {
        next_unigrams.push_back(merged_bigram);
        i += 2;
      } else {
        next_unigrams.push_back(unigrams[i]);
        ++i;
      }
    }
    // At least one pair was merged, so the sequence is strictly shorter and
    // the outer loop terminates.
    unigrams.swap(next_unigrams);
  }
  return unigrams;
}

// Tokenize `input` with byte pair encoding, one space-separated lexeme at a
// time.
//
// The input is split on the ' ' character and each lexeme is tokenized
// independently with tokenize_bpe_single(); the resulting tokens are
// concatenated in order. The space delimiters themselves are dropped by the
// split and produce no tokens.
//
// Args:
//   input: The string to tokenize.
//
// Returns:
//   The concatenated token ids of all lexemes.
std::vector<int64_t> Tokenizer::tokenize_bpe(const std::string& input) const {
  std::vector<int64_t> tokens;
  std::stringstream input_sstream(input);
  std::string lexeme;
  // Split by whitespace and tokenize each lexeme.
  // This can yield significant performance improvement for naive BPE.
  while (std::getline(input_sstream, lexeme, ' ')) {
    // Leading or consecutive spaces yield empty lexemes; there is nothing to
    // tokenize in them, so they are skipped rather than passed on.
    if (lexeme.empty()) {
      continue;
    }
    const auto lexeme_tokens = tokenize_bpe_single(lexeme);
    tokens.insert(tokens.end(), lexeme_tokens.begin(), lexeme_tokens.end());
  }
  return tokens;
}

TokenizerIterator::TokenizerIterator(std::shared_ptr<Graph<int64_t>> graph)
: g_(graph) {
edgeIndices_.push_back(0);
Expand Down
9 changes: 8 additions & 1 deletion mlx/data/core/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#pragma once

#include <map>
#include <memory>
#include <string>

Expand All @@ -22,16 +23,22 @@ class Tokenizer {
Tokenizer(
std::shared_ptr<const Trie<char>> trie,
bool ignore_unk = false,
const std::vector<double>& trie_key_scores = {});
const std::vector<double>& trie_key_scores = {},
const std::map<std::pair<int64_t, int64_t>, int64_t>& bigram_merges = {},
const std::map<std::pair<int64_t, int64_t>, int64_t>& bigram_ranks = {});
std::shared_ptr<Graph<int64_t>> tokenize(const std::string& input) const;
std::vector<int64_t> tokenize_shortest(const std::string& input) const;
std::vector<int64_t> tokenize_rand(const std::string& input) const;
std::vector<int64_t> tokenize_bpe(const std::string& input) const;
std::vector<int64_t> tokenize_bpe_single(const std::string& input) const;

private:
std::shared_ptr<const Trie<char>> trie_;
bool ignoreUnk_;
std::vector<double> trieKeyScores_;
bool trieKeyScoresPositive_;
std::map<std::pair<int64_t, int64_t>, int64_t> bigram_merges_;
std::map<std::pair<int64_t, int64_t>, int64_t> bigram_ranks_;
};

class TokenizerIterator {
Expand Down
23 changes: 21 additions & 2 deletions python/src/wrap_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,6 @@ void init_mlx_data_core(py::module& m) {
"Tokenizer",
R"pbcopy(
A Tokenizer that can be used to tokenize arbitrary strings.

Args:
trie (mlx.data.core.CharTrie): The trie containing the possible tokens.
ignore_unk (bool): Whether unknown tokens should be ignored or
Expand All @@ -211,15 +210,24 @@ void init_mlx_data_core(py::module& m) {
trie node. If left empty each score is assumed equal to 1.
Tokenize shortest minimizes the sum of these scores over
the sequence of tokens.
bigram_merges (dict[tuple[int, int], int]): A dict.
bigram_ranks (dict[tuple[int, int], int]): A dict.
vocab (dict[str, int]): A dict.
)pbcopy")
.def(
py::init<
std::shared_ptr<const Trie<char>>,
bool,
const std::vector<double>&>(),
const std::vector<double>,
const std::map<std::pair<int64_t, int64_t>, int64_t>,
const std::map<std::pair<int64_t, int64_t>, int64_t>&>(),
py::arg("trie"),
py::arg("ignore_unk") = false,
py::arg("trie_key_scores") = std::vector<double>({}),
py::arg("bigram_merges") =
std::map<std::pair<int64_t, int64_t>, int64_t>({}),
py::arg("bigram_ranks") =
std::map<std::pair<int64_t, int64_t>, int64_t>({}),
R"pbcopy(
Make a tokenizer object that can be used to tokenize arbitrary strings.

Expand All @@ -231,6 +239,8 @@ void init_mlx_data_core(py::module& m) {
trie node. If left empty each score is assumed equal to 1.
Tokenize shortest minimizes the sum of these scores over
the sequence of tokens.
bigram_merges (dict[tuple[int, int], int]): A dict.
bigram_ranks (dict[tuple[int, int], int]): A dict.
)pbcopy")
.def(
"tokenize_shortest",
Expand Down Expand Up @@ -270,6 +280,15 @@ void init_mlx_data_core(py::module& m) {
R"pbcopy(
Return the full graph of possible tokenizations.

Args:
input (str): The input string to be tokenized.
)pbcopy")
.def(
"tokenize_bpe",
&Tokenizer::tokenize_bpe,
py::arg("input"),
R"pbcopy(
Return the input tokenized using byte pair encoding.
Args:
input (str): The input string to be tokenized.
)pbcopy");
Expand Down
Loading