ml-explore · gboduljak · Jan 18, 2024
diff --git a/mlx/data/core/Tokenizer.cpp b/mlx/data/core/Tokenizer.cpp
@@ -1,6 +1,10 @@
 // Copyright © 2023 Apple Inc.
 
+#include <algorithm>
 #include <deque>
+#include <limits>
+#include <set>
+#include <sstream>
 #include <stdexcept>
 #include <string>
 #include <tuple>
@@ -118,8 +122,14 @@ std::shared_ptr<Graph<int64_t>> tokenize(
 Tokenizer::Tokenizer(
     std::shared_ptr<const Trie<char>> trie,
     bool ignore_unk,
-    const std::vector<double>& trie_key_scores)
-    : trie_(trie), ignoreUnk_(ignore_unk), trieKeyScores_(trie_key_scores) {
+    const std::vector<double>& trie_key_scores,
+    const std::map<std::pair<int64_t, int64_t>, int64_t>& bigram_merges,
+    const std::map<std::pair<int64_t, int64_t>, int64_t>& bigram_ranks)
+    : trie_(trie),
+      ignoreUnk_(ignore_unk),
+      trieKeyScores_(trie_key_scores),
+      bigram_merges_(bigram_merges),
+      bigram_ranks_(bigram_ranks) {
   if (!trie_key_scores.empty() &&
       (trie_key_scores.size() != trie->num_keys())) {
     throw std::runtime_error(
@@ -184,6 +194,81 @@ std::vector<int64_t> Tokenizer::tokenize_rand(const std::string& input) const {
   return tokens;
 }
 
+// Tokenizes one lexeme with byte pair encoding (BPE).
+//
+// The lexeme is first split with tokenize_shortest(). Then, repeatedly, the
+// adjacent token pair with the smallest rank in bigram_ranks_ is replaced, at
+// each non-overlapping occurrence found scanning left to right, by its entry in
+// bigram_merges_. Returns the tokens once no adjacent pair has a rank. Throws
+// std::out_of_range (from at()) if a ranked pair has no merge entry.
+std::vector<int64_t> Tokenizer::tokenize_bpe_single(
+    const std::string& input) const {
+  using Bigram = std::pair<int64_t, int64_t>;
+  auto unigrams = tokenize_shortest(input);
+  const auto inf = std::numeric_limits<int64_t>::max();
+
+  // Rank of a bigram; unranked pairs get the largest possible rank.
+  const auto rank_of = [&](const Bigram& bigram) {
+    const auto it = bigram_ranks_.find(bigram);
+    return it == bigram_ranks_.end() ? inf : it->second;
+  };
+
+  // Fewer than two tokens leaves nothing to merge. Checking this first also
+  // avoids evaluating `size() - 1` on an empty vector, which wraps around for
+  // the unsigned size type and would make the loops read out of bounds.
+  while (unigrams.size() >= 2) {
+    // Distinct adjacent pairs of the current token sequence.
+    std::set<Bigram> bigrams;
+    for (std::size_t i = 0; i + 1 < unigrams.size(); ++i)
+      bigrams.insert({unigrams[i], unigrams[i + 1]});
+
+    // Find the bigram with the smallest rank (i.e. highest merge priority)
+    const auto bigram_to_merge = *std::min_element(
+        bigrams.begin(),
+        bigrams.end(),
+        [&](const Bigram& left, const Bigram& right) {
+          return rank_of(left) < rank_of(right);
+        });
+    // If even the best candidate is unranked, no merge applies: we are done.
+    if (bigram_ranks_.find(bigram_to_merge) == bigram_ranks_.end())
+      break;
+
+    // Merge the bigram and obtain the resulting token
+    const auto merged_bigram = bigram_merges_.at(bigram_to_merge);
+    // Rebuild the sequence for the next iteration.
+    std::vector<int64_t> next_unigrams;
+    next_unigrams.reserve(unigrams.size());
+    std::size_t i = 0;
+    while (i < unigrams.size()) {
+      const bool is_match = i + 1 < unigrams.size() &&
+          bigram_to_merge == Bigram(unigrams[i], unigrams[i + 1]);
+      if (is_match) {
+        // Consume both halves of the pair and emit the merged token.
+        next_unigrams.push_back(merged_bigram);
+        i += 2;
+      } else {
+        next_unigrams.push_back(unigrams[i]);
+        i += 1;
+      }
+    }
+    // Adopt the rebuilt sequence without copying it.
+    unigrams.swap(next_unigrams);
+  }
+  return unigrams;
+}
+
+// Splits `input` on ' ' only and concatenates, in order, the tokens that
+// tokenize_bpe_single() produces for each piece; pieces may be empty.
+std::vector<int64_t> Tokenizer::tokenize_bpe(const std::string& input) const {
+  std::vector<int64_t> result;
+  std::istringstream words(input);
+  for (std::string word; std::getline(words, word, ' ');) {
+    const auto piece = tokenize_bpe_single(word);
+    result.insert(result.end(), piece.begin(), piece.end());
+  }
+  return result;
+}
+
 TokenizerIterator::TokenizerIterator(std::shared_ptr<Graph<int64_t>> graph)
     : g_(graph) {
   edgeIndices_.push_back(0);

diff --git a/mlx/data/core/Tokenizer.h b/mlx/data/core/Tokenizer.h
@@ -2,6 +2,7 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <string>
 
@@ -22,16 +23,22 @@ class Tokenizer {
   Tokenizer(
       std::shared_ptr<const Trie<char>> trie,
       bool ignore_unk = false,
-      const std::vector<double>& trie_key_scores = {});
+      const std::vector<double>& trie_key_scores = {},
+      const std::map<std::pair<int64_t, int64_t>, int64_t>& bigram_merges = {},
+      const std::map<std::pair<int64_t, int64_t>, int64_t>& bigram_ranks = {});
   std::shared_ptr<Graph<int64_t>> tokenize(const std::string& input) const;
   std::vector<int64_t> tokenize_shortest(const std::string& input) const;
   std::vector<int64_t> tokenize_rand(const std::string& input) const;
+  std::vector<int64_t> tokenize_bpe(const std::string& input) const;
+  std::vector<int64_t> tokenize_bpe_single(const std::string& input) const;
 
  private:
   std::shared_ptr<const Trie<char>> trie_;
   bool ignoreUnk_;
   std::vector<double> trieKeyScores_;
   bool trieKeyScoresPositive_;
+  std::map<std::pair<int64_t, int64_t>, int64_t> bigram_merges_;
+  std::map<std::pair<int64_t, int64_t>, int64_t> bigram_ranks_;
 };
 
 class TokenizerIterator {

diff --git a/python/src/wrap_core.cpp b/python/src/wrap_core.cpp
@@ -202,7 +202,6 @@ void init_mlx_data_core(py::module& m) {
       "Tokenizer",
       R"pbcopy(
         A Tokenizer that can be used to tokenize arbitrary strings.
-
         Args:
             trie (mlx.data.core.CharTrie): The trie containing the possible tokens.
             ignore_unk (bool): Whether unknown tokens should be ignored or
@@ -211,15 +210,24 @@ void init_mlx_data_core(py::module& m) {
                 trie node. If left empty each score is assumed equal to 1.
                 Tokenize shortest minimizes the sum of these scores over
                 the sequence of tokens.
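+            bigram_merges (dict[tuple[int, int], int]): Token id that each pair of adjacent token ids merges into.
+            bigram_ranks (dict[tuple[int, int], int]): Merge rank of each pair of adjacent token ids;
+                the pair with the lowest rank is merged first.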
       )pbcopy")
       .def(
           py::init<
               std::shared_ptr<const Trie<char>>,
               bool,
-              const std::vector<double>&>(),
+              const std::vector<double>,
+              const std::map<std::pair<int64_t, int64_t>, int64_t>,
+              const std::map<std::pair<int64_t, int64_t>, int64_t>&>(),
           py::arg("trie"),
           py::arg("ignore_unk") = false,
           py::arg("trie_key_scores") = std::vector<double>({}),
+          py::arg("bigram_merges") =
+              std::map<std::pair<int64_t, int64_t>, int64_t>({}),
+          py::arg("bigram_ranks") =
+              std::map<std::pair<int64_t, int64_t>, int64_t>({}),
           R"pbcopy(
             Make a tokenizer object that can be used to tokenize arbitrary strings.
 
@@ -231,6 +239,8 @@ void init_mlx_data_core(py::module& m) {
                     trie node. If left empty each score is assumed equal to 1.
                     Tokenize shortest minimizes the sum of these scores over
                     the sequence of tokens.
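+                bigram_merges (dict[tuple[int, int], int]): Token id that each pair of adjacent token ids merges into.
+                bigram_ranks (dict[tuple[int, int], int]): Merge rank of each pair; the lowest rank is merged first.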
           )pbcopy")
       .def(
           "tokenize_shortest",
@@ -270,6 +280,15 @@ void init_mlx_data_core(py::module& m) {
           R"pbcopy(
             Return the full graph of possible tokenizations.
 
+            Args:
+                input (str): The input string to be tokenized.
+           )pbcopy")
+      .def(
+          "tokenize_bpe",
+          &Tokenizer::tokenize_bpe,
+          py::arg("input"),
+          R"pbcopy(
+            Return the input tokenized using byte pair encoding.
             Args:
                 input (str): The input string to be tokenized.
            )pbcopy");