commit python-tiktoken for openSUSE:Factory

Source-Sync Wed, 05 Mar 2025 05:22:06 -0800

Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package python-tiktoken for openSUSE:Factory 
checked in at 2025-03-05 13:42:48
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-tiktoken (Old)
 and      /work/SRC/openSUSE:Factory/.python-tiktoken.new.19136 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Package is "python-tiktoken"

Wed Mar  5 13:42:48 2025 rev:3 rq:1250388 version:0.9.0

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-tiktoken/python-tiktoken.changes  2024-11-21 15:19:09.642783278 +0100
+++ /work/SRC/openSUSE:Factory/.python-tiktoken.new.19136/python-tiktoken.changes  2025-03-05 13:42:50.278375137 +0100
@@ -1,0 +2,11 @@
+Wed Mar  5 09:34:31 UTC 2025 - John Paul Adrian Glaubitz <[email protected]>
+
+- Update to version 0.9.0:
+  * Join artifacts
+  * Partial sync of codebase
+  * Partial sync of codebase (#381)
+  * Add a link to PyPI in README (#318)
+  * Improve aarch64 and mac builds (#380)
+  * Partial sync of codebase (#379)
+
+-------------------------------------------------------------------

Old:
----
  tiktoken-0.8.0.tar.zst

New:
----
  tiktoken-0.9.0.tar.zst

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-tiktoken.spec ++++++
--- /var/tmp/diff_new_pack.tPIVil/_old  2025-03-05 13:42:51.186413138 +0100
+++ /var/tmp/diff_new_pack.tPIVil/_new  2025-03-05 13:42:51.190413305 +0100
@@ -1,7 +1,7 @@
 #
 # spec file for package python-tiktoken
 #
-# Copyright (c) 2024 SUSE LLC
+# Copyright (c) 2025 SUSE LLC
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -18,7 +18,7 @@
 
 %{?sle15_python_module_pythons}
 Name:           python-tiktoken
-Version:        0.8.0
+Version:        0.9.0
 Release:        0
 Summary:        Fast BPE tokeniser for use with OpenAI's models
 License:        MIT

++++++ _service ++++++
--- /var/tmp/diff_new_pack.tPIVil/_old  2025-03-05 13:42:51.222414645 +0100
+++ /var/tmp/diff_new_pack.tPIVil/_new  2025-03-05 13:42:51.226414812 +0100
@@ -3,7 +3,7 @@
     <param name="url">https://github.com/openai/tiktoken.git</param>
     <param name="versionformat">@PARENT_TAG@</param>
     <param name="scm">git</param>
-    <param name="revision">0.8.0</param>
+    <param name="revision">0.9.0</param>
     <param name="match-tag">*</param>
     <param name="versionrewrite-pattern">v(\d+\.\d+\.\d+)</param>
     <param name="versionrewrite-replacement">\1</param>

++++++ _servicedata ++++++
--- /var/tmp/diff_new_pack.tPIVil/_old  2025-03-05 13:42:51.246415649 +0100
+++ /var/tmp/diff_new_pack.tPIVil/_new  2025-03-05 13:42:51.250415816 +0100
@@ -1,6 +1,6 @@
 <servicedata>
 <service name="tar_scm">
                 <param 
name="url">https://github.com/openai/tiktoken.git</param>
-              <param 
name="changesrevision">63527649963def8c759b0f91f2eb69a40934e468</param></service></servicedata>
+              <param 
name="changesrevision">e35ab0915e37b919946b70947f1d0854196cb72c</param></service></servicedata>
 (No newline at EOF)
 

++++++ tiktoken-0.8.0.tar.zst -> tiktoken-0.9.0.tar.zst ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/.github/workflows/build_wheels.yml 
new/tiktoken-0.9.0/.github/workflows/build_wheels.yml
--- old/tiktoken-0.8.0/.github/workflows/build_wheels.yml       2024-10-03 
23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/.github/workflows/build_wheels.yml       2025-02-14 
06:53:03.000000000 +0100
@@ -22,7 +22,7 @@
     steps:
       - uses: actions/checkout@v4
 
-      - uses: pypa/[email protected]
+      - uses: pypa/[email protected]
         env:
           CIBW_BUILD: "cp${{ matrix.python-version}}-*"
 
@@ -38,19 +38,14 @@
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest]
+        os: [ubuntu-22.04-arm]
         python-version: [39, 310, 311, 312, 313]
 
     steps:
       - uses: actions/checkout@v4
 
-      - name: Setup up QEMU
-        uses: docker/setup-qemu-action@v3
-        with:
-          platforms: arm64
-
       - name: Build wheels
-        uses: pypa/[email protected]
+        uses: pypa/[email protected]
         env:
           CIBW_BUILD: "cp${{ matrix.python-version}}-*"
           CIBW_ARCHS: aarch64
@@ -85,3 +80,15 @@
         with:
           name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
           path: ./dist/*.tar.gz
+
+  join_artifacts:
+    name: Join artifacts
+    runs-on: ubuntu-latest
+    needs: [build_wheels, build_wheels_aarch64, build_sdist]
+    steps:
+     - name: Merge artifacts
+       uses: actions/upload-artifact/merge@v4
+       with:
+         name: cibw-wheels
+         pattern: cibw-wheels-*
+         delete-merged: true
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/CHANGELOG.md 
new/tiktoken-0.9.0/CHANGELOG.md
--- old/tiktoken-0.8.0/CHANGELOG.md     2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/CHANGELOG.md     2025-02-14 06:53:03.000000000 +0100
@@ -2,6 +2,12 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.9.0]
+- Support for `o1` and `o3` models
+- Better error messages when loading invalid vocabulary files
+- Support for encoding to numpy arrays
+- Delayed imports when not strictly necessary
+
 ## [v0.8.0]
 
 - Support for `o1-` and `chatgpt-4o-` models
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/Cargo.toml 
new/tiktoken-0.9.0/Cargo.toml
--- old/tiktoken-0.8.0/Cargo.toml       2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/Cargo.toml       2025-02-14 06:53:03.000000000 +0100
@@ -1,15 +1,24 @@
 [package]
 name = "tiktoken"
-version = "0.8.0"
+version = "0.9.0"
 edition = "2021"
 rust-version = "1.57.0"
 
 [lib]
-name = "_tiktoken"
-crate-type = ["cdylib"]
+name = "tiktoken"
+crate-type = ["cdylib", "rlib"]
+
+[features]
+default = []
+python = [
+    "pyo3",
+]
 
 [dependencies]
-pyo3 = { version = "0.22.2", default-features = false, features = 
["extension-module", "macros"] }
+pyo3 = { version = "0.22.2", default-features = false, features = [
+    "extension-module",
+    "macros",
+], optional = true }
 
 # tiktoken dependencies
 fancy-regex = "0.13.0"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/README.md new/tiktoken-0.9.0/README.md
--- old/tiktoken-0.8.0/README.md        2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/README.md        2025-02-14 06:53:03.000000000 +0100
@@ -12,7 +12,7 @@
 enc = tiktoken.encoding_for_model("gpt-4o")
 ```
 
-The open source version of `tiktoken` can be installed from PyPI:
+The open source version of `tiktoken` can be installed from 
[PyPI](https://pypi.org/project/tiktoken):
 ```
 pip install tiktoken
 ```
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/pyproject.toml 
new/tiktoken-0.9.0/pyproject.toml
--- old/tiktoken-0.8.0/pyproject.toml   2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/pyproject.toml   2025-02-14 06:53:03.000000000 +0100
@@ -1,12 +1,12 @@
 [project]
 name = "tiktoken"
-version = "0.8.0"
+version = "0.9.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
-license = {file = "LICENSE"}
-authors = [{name = "Shantanu Jain"}, {email = "[email protected]"}]
+license = { file = "LICENSE" }
+authors = [{ name = "Shantanu Jain" }, { email = "[email protected]" }]
 dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
-optional-dependencies = {blobfile = ["blobfile>=2"]}
+optional-dependencies = { blobfile = ["blobfile>=2"] }
 requires-python = ">=3.9"
 
 [project.urls]
@@ -22,9 +22,10 @@
 build-frontend = "build"
 build-verbosity = 1
 
-linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs 
| sh -s -- -y"
+linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs 
| sh -s -- -y --profile minimal"
 linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
 macos.before-all = "rustup target add aarch64-apple-darwin x86_64-apple-darwin"
+macos.environment = { MACOSX_DEPLOYMENT_TARGET = "10.12" }
 
 skip = [
   "*-manylinux_i686",
@@ -39,7 +40,3 @@
 
 before-test = "pip install pytest hypothesis"
 test-command = "pytest {project}/tests --import-mode=append"
-
-[[tool.cibuildwheel.overrides]]
-select = "*linux_aarch64"
-test-command = """python -c 'import tiktoken; enc = 
tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 
995]'"""
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/setup.py new/tiktoken-0.9.0/setup.py
--- old/tiktoken-0.8.0/setup.py 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/setup.py 2025-02-14 06:53:03.000000000 +0100
@@ -10,6 +10,7 @@
             # Between our use of editable installs and wanting to use Rust for 
performance sensitive
             # code, it makes sense to just always use --release
             debug=False,
+            features=["python"],
         )
     ],
     package_data={"tiktoken": ["py.typed"]},
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/src/lib.rs 
new/tiktoken-0.9.0/src/lib.rs
--- old/tiktoken-0.8.0/src/lib.rs       2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/src/lib.rs       2025-02-14 06:53:03.000000000 +0100
@@ -1,19 +1,18 @@
-// This check is new and seems buggy (possibly with PyO3 interaction)
-#![allow(clippy::borrow_deref_ref)]
-
+use std::borrow::Borrow;
+use std::borrow::Cow;
 use std::collections::HashSet;
 use std::num::NonZeroU64;
 use std::thread;
 
 use fancy_regex::Regex;
-use pyo3::exceptions;
+#[cfg(feature = "python")]
 use pyo3::prelude::*;
-use pyo3::pybacked::PyBackedStr;
-use pyo3::types::{PyBytes, PyList, PyTuple};
-use pyo3::PyResult;
 use rustc_hash::FxHashMap as HashMap;
 
-type Rank = u32;
+#[cfg(feature = "python")]
+mod py;
+
+pub type Rank = u32;
 
 fn _byte_pair_merge(ranks: &HashMap<Vec<u8>, Rank>, piece: &[u8]) -> 
Vec<(usize, Rank)> {
     // This is a vector of (start, rank).
@@ -132,7 +131,7 @@
 // The current implementation ends up doing a lot of hashing of bytes. In 
theory, this could be made
 // to be hashing of two-tuples of ints, which looks like it may also be a 
couple percent faster.
 
-pub struct FakeThreadId(NonZeroU64);
+struct FakeThreadId(NonZeroU64);
 
 fn hash_current_thread() -> usize {
     // It's easier to use unsafe than to use nightly. Rust has this nice u64 
thread id counter
@@ -148,8 +147,8 @@
 }
 
 #[derive(Debug, Clone)]
-struct DecodeKeyError {
-    token: Rank,
+pub struct DecodeKeyError {
+    pub token: Rank,
 }
 
 impl std::fmt::Display for DecodeKeyError {
@@ -158,10 +157,26 @@
     }
 }
 
+impl std::error::Error for DecodeKeyError {}
+
+#[derive(Debug, Clone)]
+pub struct DecodeError {
+    pub message: String,
+}
+
+impl std::fmt::Display for DecodeError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "Could not decode tokens: {}", self.message)
+    }
+}
+
+impl std::error::Error for DecodeError {}
+
 const MAX_NUM_THREADS: usize = 128;
 
-#[pyclass]
-struct CoreBPE {
+#[cfg_attr(feature = "python", pyclass)]
+#[derive(Clone)]
+pub struct CoreBPE {
     encoder: HashMap<Vec<u8>, Rank>,
     special_tokens_encoder: HashMap<String, Rank>,
     decoder: HashMap<Rank, Vec<u8>>,
@@ -183,7 +198,10 @@
         &self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]
     }
 
-    fn _decode_native(&self, tokens: &[Rank]) -> Result<Vec<u8>, 
DecodeKeyError> {
+    /// Decodes tokens into a list of bytes.
+    ///
+    /// The bytes are not guaranteed to be a valid utf-8 string.
+    fn decode_bytes(&self, tokens: &[Rank]) -> Result<Vec<u8>, DecodeKeyError> 
{
         let mut ret = Vec::with_capacity(tokens.len() * 2);
         for &token in tokens {
             let token_bytes = match self.decoder.get(&token) {
@@ -198,7 +216,7 @@
         Ok(ret)
     }
 
-    fn _encode_ordinary_native(&self, text: &str) -> Vec<Rank> {
+    pub fn encode_ordinary(&self, text: &str) -> Vec<Rank> {
         // This is the core of the encoding logic; the other functions in here
         // just make things complicated :-)
         let regex = self._get_tl_regex();
@@ -213,7 +231,7 @@
         ret
     }
 
-    fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> 
(Vec<Rank>, usize) {
+    pub fn encode(&self, text: &str, allowed_special: &HashSet<&str>) -> 
(Vec<Rank>, usize) {
         let special_regex = self._get_tl_special_regex();
         let regex = self._get_tl_regex();
         let mut ret = vec![];
@@ -308,12 +326,12 @@
         (tokens, last_piece_token_len)
     }
 
-    fn _encode_unstable_native(
+    pub fn _encode_unstable_native(
         &self,
         text: &str,
         allowed_special: &HashSet<&str>,
     ) -> (Vec<Rank>, HashSet<Vec<Rank>>) {
-        let (tokens, last_piece_token_len) = self._encode_native(text, 
allowed_special);
+        let (tokens, last_piece_token_len) = self.encode(text, 
allowed_special);
         if last_piece_token_len == 0 {
             // If last_piece_token_len is zero, the last token was a special 
token and we have
             // no unstable bytes
@@ -323,7 +341,7 @@
             self._increase_last_piece_token_len(tokens, last_piece_token_len);
 
         let unstable_bytes = self
-            ._decode_native(&tokens[tokens.len() - last_piece_token_len..])
+            .decode_bytes(&tokens[tokens.len() - last_piece_token_len..])
             .unwrap();
         tokens.truncate(tokens.len() - last_piece_token_len);
 
@@ -372,7 +390,7 @@
                     // So convert to UTF-8 and do regex splitting.
                     // E.g. with cl100k_base "  !" gets split to " " + " !",
                     // but byte_pair_encode("  !") != byte_pair_encode(" ")
-                    Ok(s) => self._encode_ordinary_native(s),
+                    Ok(s) => self.encode_ordinary(s),
 
                     // Technically, whether or not this arm is correct depends 
on whether there
                     // would be a regex split before the UTF-8 truncation 
point.
@@ -425,26 +443,37 @@
 
         (tokens, completions)
     }
-}
 
-#[pymethods]
-impl CoreBPE {
-    #[new]
-    fn new(
+    pub fn new<E, SE, NSE>(
+        encoder: E,
+        special_tokens_encoder: SE,
+        pattern: &str,
+    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>>
+    where
+        E: IntoIterator<Item = (Vec<u8>, Rank)>,
+        SE: IntoIterator<Item = (String, Rank)>,
+        NSE: IntoIterator<Item = (String, (Rank, Rank))>,
+    {
+        Self::new_internal(
+            HashMap::from_iter(encoder),
+            HashMap::from_iter(special_tokens_encoder),
+            pattern,
+        )
+    }
+
+    fn new_internal(
         encoder: HashMap<Vec<u8>, Rank>,
         special_tokens_encoder: HashMap<String, Rank>,
         pattern: &str,
-    ) -> PyResult<Self> {
-        let regex = Regex::new(pattern)
-            .map_err(|e| PyErr::new::<exceptions::PyValueError, 
_>(e.to_string()))?;
+    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
+        let regex = Regex::new(pattern)?;
 
         let special_regex = {
-            let _parts = special_tokens_encoder
+            let parts = special_tokens_encoder
                 .keys()
                 .map(|s| fancy_regex::escape(s))
                 .collect::<Vec<_>>();
-            Regex::new(&_parts.join("|"))
-                .map_err(|e| PyErr::new::<exceptions::PyValueError, 
_>(e.to_string()))?
+            Regex::new(&parts.join("|"))?
         };
 
         let decoder: HashMap<Rank, Vec<u8>> =
@@ -464,7 +493,7 @@
         let mut sorted_token_bytes: Vec<Vec<u8>> = 
encoder.keys().cloned().collect();
         sorted_token_bytes.sort();
 
-        Ok(CoreBPE {
+        Ok(Self {
             encoder,
             special_tokens_encoder,
             decoder,
@@ -477,208 +506,22 @@
         })
     }
 
-    // ====================
-    // Encoding
-    // ====================
-
-    fn encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> {
-        py.allow_threads(|| self._encode_ordinary_native(text))
-    }
-
-    fn encode(&self, py: Python, text: &str, allowed_special: 
HashSet<PyBackedStr>) -> Vec<Rank> {
-        py.allow_threads(|| {
-            let allowed_special: HashSet<&str> =
-                allowed_special.iter().map(|s| s.as_ref()).collect();
-            self._encode_native(text, &allowed_special).0
-        })
-    }
-
-    fn encode_to_tiktoken_buffer(
-        &self,
-        py: Python,
-        text: &str,
-        allowed_special: HashSet<PyBackedStr>,
-    ) -> Py<PyAny> {
-        let tokens = py.allow_threads(|| {
-            let allowed_special: HashSet<&str> =
-                allowed_special.iter().map(|s| s.as_ref()).collect();
-            self._encode_native(text, &allowed_special).0
-        });
-        let buffer = TiktokenBuffer { tokens };
-        buffer.into_py(py)
-    }
-
-    fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> {
-        py.allow_threads(|| {
-            match std::str::from_utf8(bytes) {
-                Ok(text) => self._encode_ordinary_native(text),
-                Err(e) => {
-                    let text = unsafe { 
std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
-                    let (tokens, last_piece_token_len) = 
self._encode_native(text, &HashSet::new());
-                    let (mut tokens, last_piece_token_len) =
-                        self._increase_last_piece_token_len(tokens, 
last_piece_token_len);
-                    if !tokens.is_empty() && last_piece_token_len > 0 {
-                        // Lop off the tokens from the last piece and run BPE 
on the remaining bytes
-                        // Somewhat niche, but this may not be correct if we'd 
have had a regex
-                        // split between the valid UTF-8 and the invalid 
bytes, which is why this
-                        // method is private
-                        let mut unstable_bytes = self
-                            ._decode_native(&tokens[tokens.len() - 
last_piece_token_len..])
-                            .unwrap();
-                        
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
-
-                        tokens.truncate(tokens.len() - last_piece_token_len);
-                        match self.encoder.get(&unstable_bytes) {
-                            Some(token) => tokens.push(*token),
-                            None => {
-                                
tokens.extend(&byte_pair_encode(&unstable_bytes, &self.encoder))
-                            }
-                        }
-                    }
-                    tokens
-                }
-            }
-        })
-    }
-
-    fn encode_with_unstable(
-        &self,
-        py: Python,
-        text: &str,
-        allowed_special: HashSet<PyBackedStr>,
-    ) -> Py<PyTuple> {
-        let (tokens, completions) = py.allow_threads(|| {
-            let allowed_special: HashSet<&str> =
-                allowed_special.iter().map(|s| s.as_ref()).collect();
-            self._encode_unstable_native(text, &allowed_special)
-        });
-        let py_completions = PyList::new_bound(
-            py,
-            completions
-                .iter()
-                .map(|seq| PyList::new_bound(py, &seq[..])),
-        );
-        (tokens, py_completions).into_py(py)
-    }
-
-    fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> {
-        if let Some(token) = self.encoder.get(piece).copied() {
-            return Ok(token);
-        }
-        if let Ok(piece_str) = std::str::from_utf8(piece) {
-            if let Some(token) = 
self.special_tokens_encoder.get(piece_str).copied() {
-                return Ok(token);
-            }
-        }
-        Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned()))
-    }
-
-    fn encode_single_piece(&self, piece: &[u8]) -> Vec<Rank> {
-        if let Some(token) = self.encoder.get(piece) {
-            return vec![*token];
-        }
-        byte_pair_encode(piece, &self.encoder)
-    }
-
-    // ====================
-    // Decoding
-    // ====================
-
-    fn decode_bytes(&self, py: Python, tokens: Vec<Rank>) -> 
Result<Py<PyBytes>, PyErr> {
-        match py.allow_threads(|| self._decode_native(&tokens)) {
-            Ok(bytes) => Ok(PyBytes::new_bound(py, &bytes).into()),
-            Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}", 
e))),
-        }
-    }
-
-    fn decode_single_token_bytes(&self, py: Python, token: Rank) -> 
PyResult<Py<PyBytes>> {
-        if let Some(bytes) = self.decoder.get(&token) {
-            return Ok(PyBytes::new_bound(py, bytes).into());
-        }
-        if let Some(bytes) = self.special_tokens_decoder.get(&token) {
-            return Ok(PyBytes::new_bound(py, bytes).into());
-        }
-        Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
-    }
-
-    // ====================
-    // Miscellaneous
-    // ====================
-
-    fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
-        self.sorted_token_bytes
-            .iter()
-            .map(|x| PyBytes::new_bound(py, x).into())
+    pub fn special_tokens(&self) -> HashSet<&str> {
+        self.special_tokens_encoder
+            .keys()
+            .map(|s| s.as_str())
             .collect()
     }
-}
-
-#[pyclass]
-struct TiktokenBuffer {
-    tokens: Vec<Rank>,
-}
-
-#[pymethods]
-impl TiktokenBuffer {
-    // Based on 
https://github.com/PyO3/pyo3/blob/v0.22.2/tests/test_buffer_protocol.rs#L25
-    unsafe fn __getbuffer__(
-        slf: Bound<'_, Self>,
-        view: *mut pyo3::ffi::Py_buffer,
-        flags: std::os::raw::c_int,
-    ) -> PyResult<()> {
-        if view.is_null() {
-            return Err(pyo3::exceptions::PyBufferError::new_err("View is 
null"));
-        }
-        if (flags & pyo3::ffi::PyBUF_WRITABLE) == pyo3::ffi::PyBUF_WRITABLE {
-            return Err(pyo3::exceptions::PyBufferError::new_err(
-                "Object is not writable",
-            ));
-        }
-
-        (*view).obj = slf.clone().into_any().into_ptr();
-
-        let data = &slf.borrow().tokens;
-        (*view).buf = data.as_ptr() as *mut std::os::raw::c_void;
-        (*view).len = (data.len() * std::mem::size_of::<Rank>()) as isize;
-        (*view).readonly = 1;
-        (*view).itemsize = std::mem::size_of::<Rank>() as isize;
-        (*view).format = if (flags & pyo3::ffi::PyBUF_FORMAT) == 
pyo3::ffi::PyBUF_FORMAT {
-            let msg = std::ffi::CString::new("I").unwrap();
-            msg.into_raw()
-        } else {
-            std::ptr::null_mut()
-        };
-        (*view).ndim = 1;
-        (*view).shape = if (flags & pyo3::ffi::PyBUF_ND) == 
pyo3::ffi::PyBUF_ND {
-            &mut (*view).len
-        } else {
-            std::ptr::null_mut()
-        };
-        (*view).strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == 
pyo3::ffi::PyBUF_STRIDES {
-            &mut (*view).itemsize
-        } else {
-            std::ptr::null_mut()
-        };
-        (*view).suboffsets = std::ptr::null_mut();
-        (*view).internal = std::ptr::null_mut();
-
-        Ok(())
-    }
 
-    unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) {
-        std::mem::drop(std::ffi::CString::from_raw((*view).format));
+    pub fn encode_with_special_tokens(&self, text: &str) -> Vec<Rank> {
+        let allowed_special = self.special_tokens();
+        self.encode(text, &allowed_special).0
     }
 }
 
-#[pymodule]
-fn _tiktoken(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
-    m.add_class::<CoreBPE>()?;
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
-
+    use fancy_regex::Regex;
     use rustc_hash::FxHashMap as HashMap;
 
     use crate::{byte_pair_split, Rank};
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/src/py.rs new/tiktoken-0.9.0/src/py.rs
--- old/tiktoken-0.8.0/src/py.rs        1970-01-01 01:00:00.000000000 +0100
+++ new/tiktoken-0.9.0/src/py.rs        2025-02-14 06:53:03.000000000 +0100
@@ -0,0 +1,236 @@
+use std::collections::HashSet;
+
+use pyo3::{
+    exceptions,
+    prelude::*,
+    pybacked::PyBackedStr,
+    types::{PyBytes, PyList, PyTuple},
+    PyResult,
+};
+use rustc_hash::FxHashMap as HashMap;
+
+use crate::{byte_pair_encode, CoreBPE, Rank};
+
+#[pymethods]
+impl CoreBPE {
+    #[new]
+    fn py_new(
+        encoder: HashMap<Vec<u8>, Rank>,
+        special_tokens_encoder: HashMap<String, Rank>,
+        pattern: &str,
+    ) -> PyResult<Self> {
+        Self::new_internal(
+            encoder,
+            special_tokens_encoder,
+            pattern,
+        )
+        .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))
+    }
+
+    // ====================
+    // Encoding
+    // ====================
+
+    #[pyo3(name = "encode_ordinary")]
+    fn py_encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> {
+        py.allow_threads(|| self.encode_ordinary(text))
+    }
+
+    #[pyo3(name = "encode")]
+    fn py_encode(
+        &self,
+        py: Python,
+        text: &str,
+        allowed_special: HashSet<PyBackedStr>,
+    ) -> Vec<Rank> {
+        py.allow_threads(|| {
+            let allowed_special: HashSet<&str> =
+                allowed_special.iter().map(|s| s.as_ref()).collect();
+            self.encode(text, &allowed_special).0
+        })
+    }
+
+    fn encode_to_tiktoken_buffer(
+        &self,
+        py: Python,
+        text: &str,
+        allowed_special: HashSet<PyBackedStr>,
+    ) -> Py<PyAny> {
+        let tokens = py.allow_threads(|| {
+            let allowed_special: HashSet<&str> =
+                allowed_special.iter().map(|s| s.as_ref()).collect();
+            self.encode(text, &allowed_special).0
+        });
+        let buffer = TiktokenBuffer { tokens };
+        buffer.into_py(py)
+    }
+
+    fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> {
+        py.allow_threads(|| {
+            match std::str::from_utf8(bytes) {
+                Ok(text) => self.encode_ordinary(text),
+                Err(e) => {
+                    let text = unsafe { 
std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
+                    let (tokens, last_piece_token_len) = self.encode(text, 
&HashSet::new());
+                    let (mut tokens, last_piece_token_len) =
+                        self._increase_last_piece_token_len(tokens, 
last_piece_token_len);
+                    if !tokens.is_empty() && last_piece_token_len > 0 {
+                        // Lop off the tokens from the last piece and run BPE 
on the remaining bytes
+                        // Somewhat niche, but this may not be correct if we'd 
have had a regex
+                        // split between the valid UTF-8 and the invalid 
bytes, which is why this
+                        // method is private
+                        let mut unstable_bytes = self
+                            .decode_bytes(&tokens[tokens.len() - 
last_piece_token_len..])
+                            .unwrap();
+                        
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
+
+                        tokens.truncate(tokens.len() - last_piece_token_len);
+                        match self.encoder.get(&unstable_bytes) {
+                            Some(token) => tokens.push(*token),
+                            None => {
+                                
tokens.extend(&byte_pair_encode(&unstable_bytes, &self.encoder))
+                            }
+                        }
+                    }
+                    tokens
+                }
+            }
+        })
+    }
+
+    #[pyo3(name = "encode_with_unstable")]
+    fn py_encode_with_unstable(
+        &self,
+        py: Python,
+        text: &str,
+        allowed_special: HashSet<PyBackedStr>,
+    ) -> Py<PyTuple> {
+        let (tokens, completions) = py.allow_threads(|| {
+            let allowed_special: HashSet<&str> =
+                allowed_special.iter().map(|s| s.as_ref()).collect();
+            self._encode_unstable_native(text, &allowed_special)
+        });
+        let py_completions = PyList::new_bound(
+            py,
+            completions
+                .iter()
+                .map(|seq| PyList::new_bound(py, &seq[..])),
+        );
+        (tokens, py_completions).into_py(py)
+    }
+
+    fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> {
+        if let Some(token) = self.encoder.get(piece).copied() {
+            return Ok(token);
+        }
+        if let Ok(piece_str) = std::str::from_utf8(piece) {
+            if let Some(token) = 
self.special_tokens_encoder.get(piece_str).copied() {
+                return Ok(token);
+            }
+        }
+        Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned()))
+    }
+
+    fn encode_single_piece(&self, piece: &[u8]) -> Vec<Rank> {
+        if let Some(token) = self.encoder.get(piece) {
+            return vec![*token];
+        }
+        byte_pair_encode(piece, &self.encoder)
+    }
+
+    // ====================
+    // Decoding
+    // ====================
+
+    #[pyo3(name = "decode_bytes")]
+    fn py_decode_bytes(&self, py: Python, tokens: Vec<Rank>) -> 
Result<Py<PyBytes>, PyErr> {
+        match py.allow_threads(|| self.decode_bytes(&tokens)) {
+            Ok(bytes) => Ok(PyBytes::new_bound(py, &bytes).into()),
+            Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}", 
e))),
+        }
+    }
+
+    fn decode_single_token_bytes(&self, py: Python, token: Rank) -> 
PyResult<Py<PyBytes>> {
+        if let Some(bytes) = self.decoder.get(&token) {
+            return Ok(PyBytes::new_bound(py, bytes).into());
+        }
+        if let Some(bytes) = self.special_tokens_decoder.get(&token) {
+            return Ok(PyBytes::new_bound(py, bytes).into());
+        }
+        Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
+    }
+
+    // ====================
+    // Miscellaneous
+    // ====================
+
+    fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
+        self.sorted_token_bytes
+            .iter()
+            .map(|x| PyBytes::new_bound(py, x).into())
+            .collect()
+    }
+}
+
+#[pyclass]
+struct TiktokenBuffer {
+    tokens: Vec<Rank>,
+}
+
+#[pymethods]
+impl TiktokenBuffer {
+    // Based on 
https://github.com/PyO3/pyo3/blob/v0.22.2/tests/test_buffer_protocol.rs#L25
+    unsafe fn __getbuffer__(
+        slf: Bound<'_, Self>,
+        view: *mut pyo3::ffi::Py_buffer,
+        flags: std::os::raw::c_int,
+    ) -> PyResult<()> {
+        if view.is_null() {
+            return Err(pyo3::exceptions::PyBufferError::new_err("View is 
null"));
+        }
+        if (flags & pyo3::ffi::PyBUF_WRITABLE) == pyo3::ffi::PyBUF_WRITABLE {
+            return Err(pyo3::exceptions::PyBufferError::new_err(
+                "Object is not writable",
+            ));
+        }
+
+        (*view).obj = slf.clone().into_any().into_ptr();
+
+        let data = &slf.borrow().tokens;
+        (*view).buf = data.as_ptr() as *mut std::os::raw::c_void;
+        (*view).len = (data.len() * std::mem::size_of::<Rank>()) as isize;
+        (*view).readonly = 1;
+        (*view).itemsize = std::mem::size_of::<Rank>() as isize;
+        (*view).format = if (flags & pyo3::ffi::PyBUF_FORMAT) == 
pyo3::ffi::PyBUF_FORMAT {
+            let msg = std::ffi::CString::new("I").unwrap();
+            msg.into_raw()
+        } else {
+            std::ptr::null_mut()
+        };
+        (*view).ndim = 1;
+        (*view).shape = if (flags & pyo3::ffi::PyBUF_ND) == 
pyo3::ffi::PyBUF_ND {
+            &mut (*view).len
+        } else {
+            std::ptr::null_mut()
+        };
+        (*view).strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == 
pyo3::ffi::PyBUF_STRIDES {
+            &mut (*view).itemsize
+        } else {
+            std::ptr::null_mut()
+        };
+        (*view).suboffsets = std::ptr::null_mut();
+        (*view).internal = std::ptr::null_mut();
+
+        Ok(())
+    }
+
+    unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) {
+        std::mem::drop(std::ffi::CString::from_raw((*view).format));
+    }
+}
+
+#[pymodule]
+fn _tiktoken(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
+    m.add_class::<CoreBPE>()?;
+    Ok(())
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/tests/test_pickle.py 
new/tiktoken-0.9.0/tests/test_pickle.py
--- old/tiktoken-0.8.0/tests/test_pickle.py     1970-01-01 01:00:00.000000000 
+0100
+++ new/tiktoken-0.9.0/tests/test_pickle.py     2025-02-14 06:53:03.000000000 
+0100
@@ -0,0 +1,23 @@
+import tiktoken
+
+
+def test_pickle():
+    import pickle
+
+    enc_old = tiktoken.get_encoding("r50k_base")
+    enc_new = pickle.loads(pickle.dumps(enc_old))
+    assert enc_old.encode("hello world") == enc_new.encode("hello world")
+
+    enc_old = tiktoken.Encoding(
+        name="custom_enc",
+        pat_str=enc_old._pat_str,
+        mergeable_ranks=enc_old._mergeable_ranks,
+        special_tokens={"<|pickle|>": 100_000},
+    )
+    enc_new = pickle.loads(pickle.dumps(enc_old))
+    assert enc_old.encode("hello world") == enc_new.encode("hello world")
+    assert (
+        enc_old.encode("<|pickle|>", allowed_special="all")
+        == enc_new.encode("<|pickle|>", allowed_special="all")
+        == [100_000]
+    )
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/__init__.py 
new/tiktoken-0.9.0/tiktoken/__init__.py
--- old/tiktoken-0.8.0/tiktoken/__init__.py     2024-10-03 23:15:34.000000000 
+0200
+++ new/tiktoken-0.9.0/tiktoken/__init__.py     2025-02-14 06:53:03.000000000 
+0100
@@ -5,4 +5,4 @@
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
 
-__version__ = "0.8.0"
+__version__ = "0.9.0"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/core.py 
new/tiktoken-0.9.0/tiktoken/core.py
--- old/tiktoken-0.8.0/tiktoken/core.py 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/tiktoken/core.py 2025-02-14 06:53:03.000000000 +0100
@@ -2,12 +2,16 @@
 
 import functools
 from concurrent.futures import ThreadPoolExecutor
-from typing import AbstractSet, Collection, Literal, NoReturn, Sequence
+from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, 
Sequence
 
 import regex
 
 from tiktoken import _tiktoken
 
+if TYPE_CHECKING:
+    import numpy as np
+    import numpy.typing as npt
+
 
 class Encoding:
     def __init__(
@@ -128,6 +132,32 @@
             text = text.encode("utf-16", "surrogatepass").decode("utf-16", 
"replace")
             return self._core_bpe.encode(text, allowed_special)
 
+    def encode_to_numpy(
+        self,
+        text: str,
+        *,
+        allowed_special: Literal["all"] | AbstractSet[str] = set(),  # noqa: 
B006
+        disallowed_special: Literal["all"] | Collection[str] = "all",
+    ) -> npt.NDArray[np.uint32]:
+        """Encodes a string into tokens, returning a numpy array.
+
+        Avoids the overhead of copying the token buffer into a Python list.
+        """
+        if allowed_special == "all":
+            allowed_special = self.special_tokens_set
+        if disallowed_special == "all":
+            disallowed_special = self.special_tokens_set - allowed_special
+        if disallowed_special:
+            if not isinstance(disallowed_special, frozenset):
+                disallowed_special = frozenset(disallowed_special)
+            if match := _special_token_regex(disallowed_special).search(text):
+                raise_disallowed_special_token(match.group())
+
+        import numpy as np
+
+        buffer = self._core_bpe.encode_to_tiktoken_buffer(text, 
self.special_tokens_set)
+        return np.frombuffer(buffer, dtype=np.uint32)
+
     def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) 
-> list[list[int]]:
         """Encodes a list of strings into tokens, in parallel, ignoring 
special tokens.
 
@@ -332,6 +362,10 @@
     def special_tokens_set(self) -> set[str]:
         return set(self._special_tokens.keys())
 
+    def is_special_token(self, token: int) -> bool:
+        assert isinstance(token, int)
+        return token in self._special_token_values
+
     @property
     def n_vocab(self) -> int:
         """For backwards compatibility. Prefer to use `enc.max_token_value + 
1`."""
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/load.py 
new/tiktoken-0.9.0/tiktoken/load.py
--- old/tiktoken-0.8.0/tiktoken/load.py 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/tiktoken/load.py 2025-02-14 06:53:03.000000000 +0100
@@ -2,12 +2,7 @@
 
 import base64
 import hashlib
-import json
 import os
-import tempfile
-import uuid
-
-import requests
 
 
 def read_file(blobpath: str) -> bytes:
@@ -20,7 +15,10 @@
             ) from e
         with blobfile.BlobFile(blobpath, "rb") as f:
             return f.read()
+
     # avoiding blobfile for public files helps avoid auth issues, like MFA 
prompts
+    import requests
+
     resp = requests.get(blobpath)
     resp.raise_for_status()
     return resp.content
@@ -38,6 +36,8 @@
     elif "DATA_GYM_CACHE_DIR" in os.environ:
         cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
     else:
+        import tempfile
+
         cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
         user_specified_cache = False
 
@@ -67,6 +67,8 @@
             f"This may indicate a corrupted download. Please try again."
         )
 
+    import uuid
+
     try:
         os.makedirs(cache_dir, exist_ok=True)
         tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
@@ -114,6 +116,8 @@
         bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n
         n += 1
 
+    import json
+
     # check that the encoder file matches the merges file
     # this sanity check is important since tiktoken assumes that ranks are 
ordered the same
     # as merge priority
@@ -142,7 +146,13 @@
 def load_tiktoken_bpe(tiktoken_bpe_file: str, expected_hash: str | None = 
None) -> dict[bytes, int]:
     # NB: do not add caching to this function
     contents = read_file_cached(tiktoken_bpe_file, expected_hash)
-    return {
-        base64.b64decode(token): int(rank)
-        for token, rank in (line.split() for line in contents.splitlines() if 
line)
-    }
+    ret = {}
+    for line in contents.splitlines():
+        if not line:
+            continue
+        try:
+            token, rank = line.split()
+            ret[base64.b64decode(token)] = int(rank)
+        except Exception as e:
+            raise ValueError(f"Error parsing line {line!r} in 
{tiktoken_bpe_file}") from e
+    return ret
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/model.py 
new/tiktoken-0.9.0/tiktoken/model.py
--- old/tiktoken-0.8.0/tiktoken/model.py        2024-10-03 23:15:34.000000000 
+0200
+++ new/tiktoken-0.9.0/tiktoken/model.py        2025-02-14 06:53:03.000000000 
+0100
@@ -6,6 +6,7 @@
 # TODO: these will likely be replaced by an API endpoint
 MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     "o1-": "o200k_base",
+    "o3-": "o200k_base",
     # chat
     "chatgpt-4o-": "o200k_base",
     "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
@@ -13,6 +14,7 @@
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
     "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
     # fine-tuned
+    "ft:gpt-4o": "o200k_base",
     "ft:gpt-4": "cl100k_base",
     "ft:gpt-3.5-turbo": "cl100k_base",
     "ft:davinci-002": "cl100k_base",
@@ -20,6 +22,9 @@
 }
 
 MODEL_TO_ENCODING: dict[str, str] = {
+    # reasoning
+    "o1": "o200k_base",
+    "o3": "o200k_base",
     # chat
     "gpt-4o": "o200k_base",
     "gpt-4": "cl100k_base",

++++++ vendor.tar.zst ++++++
++++ 564379 lines of diff (skipped)

commit python-tiktoken for openSUSE:Factory

Reply via email to