Script 'mail_helper' called by obssrc

Hello community,

here is the log from the commit of package python-tiktoken for openSUSE:Factory
checked in at 2025-03-05 13:42:48

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-tiktoken (Old)
 and      /work/SRC/openSUSE:Factory/.python-tiktoken.new.19136 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-tiktoken" Wed Mar 5 13:42:48 2025 rev:3 rq:1250388 version:0.9.0 Changes: -------- --- /work/SRC/openSUSE:Factory/python-tiktoken/python-tiktoken.changes 2024-11-21 15:19:09.642783278 +0100 +++ /work/SRC/openSUSE:Factory/.python-tiktoken.new.19136/python-tiktoken.changes 2025-03-05 13:42:50.278375137 +0100 @@ -1,0 +2,11 @@ +Wed Mar 5 09:34:31 UTC 2025 - John Paul Adrian Glaubitz <adrian.glaub...@suse.com> + +- Update to version 0.9.0: + * Join artifacts + * Partial sync of codebase + * Partial sync of codebase (#381) + * Add a link to PyPI in README (#318) + * Improve aarch64 and mac builds (#380) + * Partial sync of codebase (#379) + +------------------------------------------------------------------- Old: ---- tiktoken-0.8.0.tar.zst New: ---- tiktoken-0.9.0.tar.zst ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-tiktoken.spec ++++++ --- /var/tmp/diff_new_pack.tPIVil/_old 2025-03-05 13:42:51.186413138 +0100 +++ /var/tmp/diff_new_pack.tPIVil/_new 2025-03-05 13:42:51.190413305 +0100 @@ -1,7 +1,7 @@ # # spec file for package python-tiktoken # -# Copyright (c) 2024 SUSE LLC +# Copyright (c) 2025 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -18,7 +18,7 @@ %{?sle15_python_module_pythons} Name: python-tiktoken -Version: 0.8.0 +Version: 0.9.0 Release: 0 Summary: Fast BPE tokeniser for use with OpenAI's models License: MIT ++++++ _service ++++++ --- /var/tmp/diff_new_pack.tPIVil/_old 2025-03-05 13:42:51.222414645 +0100 +++ /var/tmp/diff_new_pack.tPIVil/_new 2025-03-05 13:42:51.226414812 +0100 @@ -3,7 +3,7 @@ <param name="url">https://github.com/openai/tiktoken.git</param> <param name="versionformat">@PARENT_TAG@</param> <param name="scm">git</param> - <param name="revision">0.8.0</param> + <param name="revision">0.9.0</param> <param name="match-tag">*</param> <param name="versionrewrite-pattern">v(\d+\.\d+\.\d+)</param> <param name="versionrewrite-replacement">\1</param> ++++++ _servicedata ++++++ --- /var/tmp/diff_new_pack.tPIVil/_old 2025-03-05 13:42:51.246415649 +0100 +++ /var/tmp/diff_new_pack.tPIVil/_new 2025-03-05 13:42:51.250415816 +0100 @@ -1,6 +1,6 @@ <servicedata> <service name="tar_scm"> <param name="url">https://github.com/openai/tiktoken.git</param> - <param name="changesrevision">63527649963def8c759b0f91f2eb69a40934e468</param></service></servicedata> + <param name="changesrevision">e35ab0915e37b919946b70947f1d0854196cb72c</param></service></servicedata> (No newline at EOF) ++++++ tiktoken-0.8.0.tar.zst -> tiktoken-0.9.0.tar.zst ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/.github/workflows/build_wheels.yml new/tiktoken-0.9.0/.github/workflows/build_wheels.yml --- old/tiktoken-0.8.0/.github/workflows/build_wheels.yml 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/.github/workflows/build_wheels.yml 2025-02-14 06:53:03.000000000 +0100 @@ -22,7 +22,7 @@ steps: - uses: actions/checkout@v4 - - uses: pypa/cibuildwheel@v2.21.2 + - uses: pypa/cibuildwheel@v2.22.0 env: CIBW_BUILD: "cp${{ matrix.python-version}}-*" @@ -38,19 +38,14 @@ strategy: fail-fast: false matrix: - os: [ubuntu-latest] + os: [ubuntu-22.04-arm] python-version: [39, 310, 311, 312, 313] steps: - uses: actions/checkout@v4 - - name: Setup up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: arm64 - - name: Build wheels - 
uses: pypa/cibuildwheel@v2.21.2 + uses: pypa/cibuildwheel@v2.22.0 env: CIBW_BUILD: "cp${{ matrix.python-version}}-*" CIBW_ARCHS: aarch64 @@ -85,3 +80,15 @@ with: name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} path: ./dist/*.tar.gz + + join_artifacts: + name: Join artifacts + runs-on: ubuntu-latest + needs: [build_wheels, build_wheels_aarch64, build_sdist] + steps: + - name: Merge artifacts + uses: actions/upload-artifact/merge@v4 + with: + name: cibw-wheels + pattern: cibw-wheels-* + delete-merged: true diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/CHANGELOG.md new/tiktoken-0.9.0/CHANGELOG.md --- old/tiktoken-0.8.0/CHANGELOG.md 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/CHANGELOG.md 2025-02-14 06:53:03.000000000 +0100 @@ -2,6 +2,12 @@ This is the changelog for the open source version of tiktoken. +## [v0.9.0] +- Support for `o1` and `o3` models +- Better error messages when loading invalid vocabulary files +- Support for encoding to numpy arrays +- Delayed imports when not strictly necessary + ## [v0.8.0] - Support for `o1-` and `chatgpt-4o-` models diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/Cargo.toml new/tiktoken-0.9.0/Cargo.toml --- old/tiktoken-0.8.0/Cargo.toml 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/Cargo.toml 2025-02-14 06:53:03.000000000 +0100 @@ -1,15 +1,24 @@ [package] name = "tiktoken" -version = "0.8.0" +version = "0.9.0" edition = "2021" rust-version = "1.57.0" [lib] -name = "_tiktoken" -crate-type = ["cdylib"] +name = "tiktoken" +crate-type = ["cdylib", "rlib"] + +[features] +default = [] +python = [ + "pyo3", +] [dependencies] -pyo3 = { version = "0.22.2", default-features = false, features = ["extension-module", "macros"] } +pyo3 = { version = "0.22.2", default-features = false, features = [ + "extension-module", + "macros", +], optional = true } # tiktoken dependencies fancy-regex = "0.13.0" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/README.md new/tiktoken-0.9.0/README.md --- old/tiktoken-0.8.0/README.md 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/README.md 2025-02-14 06:53:03.000000000 +0100 @@ -12,7 +12,7 @@ enc = tiktoken.encoding_for_model("gpt-4o") ``` -The open source version of `tiktoken` can be installed from PyPI: +The open source version of `tiktoken` can be installed from [PyPI](https://pypi.org/project/tiktoken): ``` pip install tiktoken ``` diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/pyproject.toml new/tiktoken-0.9.0/pyproject.toml --- old/tiktoken-0.8.0/pyproject.toml 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/pyproject.toml 2025-02-14 06:53:03.000000000 +0100 @@ -1,12 +1,12 @@ [project] name = "tiktoken" -version = "0.8.0" +version = "0.9.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" -license = {file = "LICENSE"} -authors = [{name = "Shantanu Jain"}, {email = "shant...@openai.com"}] +license = { file = "LICENSE" } +authors = [{ name = "Shantanu Jain" }, { email = "shant...@openai.com" }] dependencies = ["regex>=2022.1.18", "requests>=2.26.0"] -optional-dependencies = {blobfile = ["blobfile>=2"]} +optional-dependencies = { blobfile = ["blobfile>=2"] } requires-python = ">=3.9" [project.urls] @@ -22,9 +22,10 @@ build-frontend = "build" build-verbosity = 1 -linux.before-all = 
"curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y" +linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal" linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" } macos.before-all = "rustup target add aarch64-apple-darwin x86_64-apple-darwin" +macos.environment = { MACOSX_DEPLOYMENT_TARGET = "10.12" } skip = [ "*-manylinux_i686", @@ -39,7 +40,3 @@ before-test = "pip install pytest hypothesis" test-command = "pytest {project}/tests --import-mode=append" - -[[tool.cibuildwheel.overrides]] -select = "*linux_aarch64" -test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'""" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/setup.py new/tiktoken-0.9.0/setup.py --- old/tiktoken-0.8.0/setup.py 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/setup.py 2025-02-14 06:53:03.000000000 +0100 @@ -10,6 +10,7 @@ # Between our use of editable installs and wanting to use Rust for performance sensitive # code, it makes sense to just always use --release debug=False, + features=["python"], ) ], package_data={"tiktoken": ["py.typed"]}, diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/src/lib.rs new/tiktoken-0.9.0/src/lib.rs --- old/tiktoken-0.8.0/src/lib.rs 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/src/lib.rs 2025-02-14 06:53:03.000000000 +0100 @@ -1,19 +1,18 @@ -// This check is new and seems buggy (possibly with PyO3 interaction) -#![allow(clippy::borrow_deref_ref)] - +use std::borrow::Borrow; +use std::borrow::Cow; use std::collections::HashSet; use std::num::NonZeroU64; use std::thread; use fancy_regex::Regex; -use pyo3::exceptions; +#[cfg(feature = "python")] use pyo3::prelude::*; -use pyo3::pybacked::PyBackedStr; -use pyo3::types::{PyBytes, PyList, PyTuple}; -use pyo3::PyResult; use rustc_hash::FxHashMap as HashMap; -type Rank = u32; +#[cfg(feature = "python")] +mod py; + +pub type Rank = u32; fn _byte_pair_merge(ranks: &HashMap<Vec<u8>, Rank>, piece: &[u8]) -> Vec<(usize, Rank)> { // This is a vector of (start, rank). @@ -132,7 +131,7 @@ // The current implementation ends up doing a lot of hashing of bytes. In theory, this could be made // to be hashing of two-tuples of ints, which looks like it may also be a couple percent faster. -pub struct FakeThreadId(NonZeroU64); +struct FakeThreadId(NonZeroU64); fn hash_current_thread() -> usize { // It's easier to use unsafe than to use nightly. 
Rust has this nice u64 thread id counter @@ -148,8 +147,8 @@ } #[derive(Debug, Clone)] -struct DecodeKeyError { - token: Rank, +pub struct DecodeKeyError { + pub token: Rank, } impl std::fmt::Display for DecodeKeyError { @@ -158,10 +157,26 @@ } } +impl std::error::Error for DecodeKeyError {} + +#[derive(Debug, Clone)] +pub struct DecodeError { + pub message: String, +} + +impl std::fmt::Display for DecodeError { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Could not decode tokens: {}", self.message) + } +} + +impl std::error::Error for DecodeError {} + const MAX_NUM_THREADS: usize = 128; -#[pyclass] -struct CoreBPE { +#[cfg_attr(feature = "python", pyclass)] +#[derive(Clone)] +pub struct CoreBPE { encoder: HashMap<Vec<u8>, Rank>, special_tokens_encoder: HashMap<String, Rank>, decoder: HashMap<Rank, Vec<u8>>, @@ -183,7 +198,10 @@ &self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS] } - fn _decode_native(&self, tokens: &[Rank]) -> Result<Vec<u8>, DecodeKeyError> { + /// Decodes tokens into a list of bytes. + /// + /// The bytes are not gauranteed to be a valid utf-8 string. + fn decode_bytes(&self, tokens: &[Rank]) -> Result<Vec<u8>, DecodeKeyError> { let mut ret = Vec::with_capacity(tokens.len() * 2); for &token in tokens { let token_bytes = match self.decoder.get(&token) { @@ -198,7 +216,7 @@ Ok(ret) } - fn _encode_ordinary_native(&self, text: &str) -> Vec<Rank> { + pub fn encode_ordinary(&self, text: &str) -> Vec<Rank> { // This is the core of the encoding logic; the other functions in here // just make things complicated :-) let regex = self._get_tl_regex(); @@ -213,7 +231,7 @@ ret } - fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<Rank>, usize) { + pub fn encode(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<Rank>, usize) { let special_regex = self._get_tl_special_regex(); let regex = self._get_tl_regex(); let mut ret = vec![]; @@ -308,12 +326,12 @@ (tokens, last_piece_token_len) } - fn _encode_unstable_native( + pub fn _encode_unstable_native( &self, text: &str, allowed_special: &HashSet<&str>, ) -> (Vec<Rank>, HashSet<Vec<Rank>>) { - let (tokens, last_piece_token_len) = self._encode_native(text, allowed_special); + let (tokens, last_piece_token_len) = self.encode(text, allowed_special); if last_piece_token_len == 0 { // If last_piece_token_len is zero, the last token was a special token and we have // no unstable bytes @@ -323,7 +341,7 @@ self._increase_last_piece_token_len(tokens, last_piece_token_len); let unstable_bytes = self - ._decode_native(&tokens[tokens.len() - last_piece_token_len..]) + .decode_bytes(&tokens[tokens.len() - last_piece_token_len..]) .unwrap(); tokens.truncate(tokens.len() - last_piece_token_len); @@ -372,7 +390,7 @@ // So convert to UTF-8 and do regex splitting. // E.g. with cl100k_base " !" gets split to " " + " !", // but byte_pair_encode(" !") != byte_pair_encode(" ") - Ok(s) => self._encode_ordinary_native(s), + Ok(s) => self.encode_ordinary(s), // Technically, whether or not this arm is correct depends on whether there // would be a regex split before the UTF-8 truncation point. 
@@ -425,26 +443,37 @@ (tokens, completions) } -} -#[pymethods] -impl CoreBPE { - #[new] - fn new( + pub fn new<E, SE, NSE>( + encoder: E, + special_tokens_encoder: SE, + pattern: &str, + ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> + where + E: IntoIterator<Item = (Vec<u8>, Rank)>, + SE: IntoIterator<Item = (String, Rank)>, + NSE: IntoIterator<Item = (String, (Rank, Rank))>, + { + Self::new_internal( + HashMap::from_iter(encoder), + HashMap::from_iter(special_tokens_encoder), + pattern, + ) + } + + fn new_internal( encoder: HashMap<Vec<u8>, Rank>, special_tokens_encoder: HashMap<String, Rank>, pattern: &str, - ) -> PyResult<Self> { - let regex = Regex::new(pattern) - .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))?; + ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> { + let regex = Regex::new(pattern)?; let special_regex = { - let _parts = special_tokens_encoder + let parts = special_tokens_encoder .keys() .map(|s| fancy_regex::escape(s)) .collect::<Vec<_>>(); - Regex::new(&_parts.join("|")) - .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))? + Regex::new(&parts.join("|"))? }; let decoder: HashMap<Rank, Vec<u8>> = @@ -464,7 +493,7 @@ let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect(); sorted_token_bytes.sort(); - Ok(CoreBPE { + Ok(Self { encoder, special_tokens_encoder, decoder, @@ -477,208 +506,22 @@ }) } - // ==================== - // Encoding - // ==================== - - fn encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> { - py.allow_threads(|| self._encode_ordinary_native(text)) - } - - fn encode(&self, py: Python, text: &str, allowed_special: HashSet<PyBackedStr>) -> Vec<Rank> { - py.allow_threads(|| { - let allowed_special: HashSet<&str> = - allowed_special.iter().map(|s| s.as_ref()).collect(); - self._encode_native(text, &allowed_special).0 - }) - } - - fn encode_to_tiktoken_buffer( - &self, - py: Python, - text: &str, - allowed_special: HashSet<PyBackedStr>, - ) -> Py<PyAny> { - let tokens = py.allow_threads(|| { - let allowed_special: HashSet<&str> = - allowed_special.iter().map(|s| s.as_ref()).collect(); - self._encode_native(text, &allowed_special).0 - }); - let buffer = TiktokenBuffer { tokens }; - buffer.into_py(py) - } - - fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> { - py.allow_threads(|| { - match std::str::from_utf8(bytes) { - Ok(text) => self._encode_ordinary_native(text), - Err(e) => { - let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) }; - let (tokens, last_piece_token_len) = self._encode_native(text, &HashSet::new()); - let (mut tokens, last_piece_token_len) = - self._increase_last_piece_token_len(tokens, last_piece_token_len); - if !tokens.is_empty() && last_piece_token_len > 0 { - // Lop off the tokens from the last piece and run BPE on the remaining bytes - // Somewhat niche, but this may not be correct if we'd have had a regex - // split between the valid UTF-8 and the invalid bytes, which is why this - // method is private - let mut unstable_bytes = self - ._decode_native(&tokens[tokens.len() - last_piece_token_len..]) - .unwrap(); - unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]); - - tokens.truncate(tokens.len() - last_piece_token_len); - match self.encoder.get(&unstable_bytes) { - Some(token) => tokens.push(*token), - None => { - tokens.extend(&byte_pair_encode(&unstable_bytes, &self.encoder)) - } - } - } - tokens - } - } - }) - } - - fn encode_with_unstable( - &self, - py: Python, - text: &str, 
- allowed_special: HashSet<PyBackedStr>, - ) -> Py<PyTuple> { - let (tokens, completions) = py.allow_threads(|| { - let allowed_special: HashSet<&str> = - allowed_special.iter().map(|s| s.as_ref()).collect(); - self._encode_unstable_native(text, &allowed_special) - }); - let py_completions = PyList::new_bound( - py, - completions - .iter() - .map(|seq| PyList::new_bound(py, &seq[..])), - ); - (tokens, py_completions).into_py(py) - } - - fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> { - if let Some(token) = self.encoder.get(piece).copied() { - return Ok(token); - } - if let Ok(piece_str) = std::str::from_utf8(piece) { - if let Some(token) = self.special_tokens_encoder.get(piece_str).copied() { - return Ok(token); - } - } - Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned())) - } - - fn encode_single_piece(&self, piece: &[u8]) -> Vec<Rank> { - if let Some(token) = self.encoder.get(piece) { - return vec![*token]; - } - byte_pair_encode(piece, &self.encoder) - } - - // ==================== - // Decoding - // ==================== - - fn decode_bytes(&self, py: Python, tokens: Vec<Rank>) -> Result<Py<PyBytes>, PyErr> { - match py.allow_threads(|| self._decode_native(&tokens)) { - Ok(bytes) => Ok(PyBytes::new_bound(py, &bytes).into()), - Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}", e))), - } - } - - fn decode_single_token_bytes(&self, py: Python, token: Rank) -> PyResult<Py<PyBytes>> { - if let Some(bytes) = self.decoder.get(&token) { - return Ok(PyBytes::new_bound(py, bytes).into()); - } - if let Some(bytes) = self.special_tokens_decoder.get(&token) { - return Ok(PyBytes::new_bound(py, bytes).into()); - } - Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string())) - } - - // ==================== - // Miscellaneous - // ==================== - - fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> { - self.sorted_token_bytes - .iter() - .map(|x| PyBytes::new_bound(py, x).into()) + pub fn special_tokens(&self) -> HashSet<&str> { + self.special_tokens_encoder + .keys() + .map(|s| s.as_str()) .collect() } -} - -#[pyclass] -struct TiktokenBuffer { - tokens: Vec<Rank>, -} - -#[pymethods] -impl TiktokenBuffer { - // Based on https://github.com/PyO3/pyo3/blob/v0.22.2/tests/test_buffer_protocol.rs#L25 - unsafe fn __getbuffer__( - slf: Bound<'_, Self>, - view: *mut pyo3::ffi::Py_buffer, - flags: std::os::raw::c_int, - ) -> PyResult<()> { - if view.is_null() { - return Err(pyo3::exceptions::PyBufferError::new_err("View is null")); - } - if (flags & pyo3::ffi::PyBUF_WRITABLE) == pyo3::ffi::PyBUF_WRITABLE { - return Err(pyo3::exceptions::PyBufferError::new_err( - "Object is not writable", - )); - } - - (*view).obj = slf.clone().into_any().into_ptr(); - - let data = &slf.borrow().tokens; - (*view).buf = data.as_ptr() as *mut std::os::raw::c_void; - (*view).len = (data.len() * std::mem::size_of::<Rank>()) as isize; - (*view).readonly = 1; - (*view).itemsize = std::mem::size_of::<Rank>() as isize; - (*view).format = if (flags & pyo3::ffi::PyBUF_FORMAT) == pyo3::ffi::PyBUF_FORMAT { - let msg = std::ffi::CString::new("I").unwrap(); - msg.into_raw() - } else { - std::ptr::null_mut() - }; - (*view).ndim = 1; - (*view).shape = if (flags & pyo3::ffi::PyBUF_ND) == pyo3::ffi::PyBUF_ND { - &mut (*view).len - } else { - std::ptr::null_mut() - }; - (*view).strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == pyo3::ffi::PyBUF_STRIDES { - &mut (*view).itemsize - } else { - std::ptr::null_mut() - }; - (*view).suboffsets = std::ptr::null_mut(); - (*view).internal = 
std::ptr::null_mut(); - - Ok(()) - } - unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) { - std::mem::drop(std::ffi::CString::from_raw((*view).format)); + pub fn encode_with_special_tokens(&self, text: &str) -> Vec<Rank> { + let allowed_special = self.special_tokens(); + self.encode(text, &allowed_special).0 } } -#[pymodule] -fn _tiktoken(_py: Python, m: &Bound<PyModule>) -> PyResult<()> { - m.add_class::<CoreBPE>()?; - Ok(()) -} - #[cfg(test)] mod tests { - + use fancy_regex::Regex; use rustc_hash::FxHashMap as HashMap; use crate::{byte_pair_split, Rank}; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/src/py.rs new/tiktoken-0.9.0/src/py.rs --- old/tiktoken-0.8.0/src/py.rs 1970-01-01 01:00:00.000000000 +0100 +++ new/tiktoken-0.9.0/src/py.rs 2025-02-14 06:53:03.000000000 +0100 @@ -0,0 +1,236 @@ +use std::collections::HashSet; + +use pyo3::{ + exceptions, + prelude::*, + pybacked::PyBackedStr, + types::{PyBytes, PyList, PyTuple}, + PyResult, +}; +use rustc_hash::FxHashMap as HashMap; + +use crate::{byte_pair_encode, CoreBPE, Rank}; + +#[pymethods] +impl CoreBPE { + #[new] + fn py_new( + encoder: HashMap<Vec<u8>, Rank>, + special_tokens_encoder: HashMap<String, Rank>, + pattern: &str, + ) -> PyResult<Self> { + Self::new_internal( + encoder, + special_tokens_encoder, + pattern, + ) + .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string())) + } + + // ==================== + // Encoding + // ==================== + + #[pyo3(name = "encode_ordinary")] + fn py_encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> { + py.allow_threads(|| self.encode_ordinary(text)) + } + + #[pyo3(name = "encode")] + fn py_encode( + &self, + py: Python, + text: &str, + allowed_special: HashSet<PyBackedStr>, + ) -> Vec<Rank> { + py.allow_threads(|| { + let allowed_special: HashSet<&str> = + allowed_special.iter().map(|s| s.as_ref()).collect(); + self.encode(text, &allowed_special).0 + }) + } + + fn encode_to_tiktoken_buffer( + &self, + py: Python, + text: &str, + allowed_special: HashSet<PyBackedStr>, + ) -> Py<PyAny> { + let tokens = py.allow_threads(|| { + let allowed_special: HashSet<&str> = + allowed_special.iter().map(|s| s.as_ref()).collect(); + self.encode(text, &allowed_special).0 + }); + let buffer = TiktokenBuffer { tokens }; + buffer.into_py(py) + } + + fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> { + py.allow_threads(|| { + match std::str::from_utf8(bytes) { + Ok(text) => self.encode_ordinary(text), + Err(e) => { + let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) }; + let (tokens, last_piece_token_len) = self.encode(text, &HashSet::new()); + let (mut tokens, last_piece_token_len) = + self._increase_last_piece_token_len(tokens, last_piece_token_len); + if !tokens.is_empty() && last_piece_token_len > 0 { + // Lop off the tokens from the last piece and run BPE on the remaining bytes + // Somewhat niche, but this may not be correct if we'd have had a regex + // split between the valid UTF-8 and the invalid bytes, which is why this + // method is private + let mut unstable_bytes = self + .decode_bytes(&tokens[tokens.len() - last_piece_token_len..]) + .unwrap(); + unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]); + + tokens.truncate(tokens.len() - last_piece_token_len); + match self.encoder.get(&unstable_bytes) { + Some(token) => tokens.push(*token), + None => { + tokens.extend(&byte_pair_encode(&unstable_bytes, &self.encoder)) + } + } + } + tokens + } + } + 
}) + } + + #[pyo3(name = "encode_with_unstable")] + fn py_encode_with_unstable( + &self, + py: Python, + text: &str, + allowed_special: HashSet<PyBackedStr>, + ) -> Py<PyTuple> { + let (tokens, completions) = py.allow_threads(|| { + let allowed_special: HashSet<&str> = + allowed_special.iter().map(|s| s.as_ref()).collect(); + self._encode_unstable_native(text, &allowed_special) + }); + let py_completions = PyList::new_bound( + py, + completions + .iter() + .map(|seq| PyList::new_bound(py, &seq[..])), + ); + (tokens, py_completions).into_py(py) + } + + fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> { + if let Some(token) = self.encoder.get(piece).copied() { + return Ok(token); + } + if let Ok(piece_str) = std::str::from_utf8(piece) { + if let Some(token) = self.special_tokens_encoder.get(piece_str).copied() { + return Ok(token); + } + } + Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned())) + } + + fn encode_single_piece(&self, piece: &[u8]) -> Vec<Rank> { + if let Some(token) = self.encoder.get(piece) { + return vec![*token]; + } + byte_pair_encode(piece, &self.encoder) + } + + // ==================== + // Decoding + // ==================== + + #[pyo3(name = "decode_bytes")] + fn py_decode_bytes(&self, py: Python, tokens: Vec<Rank>) -> Result<Py<PyBytes>, PyErr> { + match py.allow_threads(|| self.decode_bytes(&tokens)) { + Ok(bytes) => Ok(PyBytes::new_bound(py, &bytes).into()), + Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}", e))), + } + } + + fn decode_single_token_bytes(&self, py: Python, token: Rank) -> PyResult<Py<PyBytes>> { + if let Some(bytes) = self.decoder.get(&token) { + return Ok(PyBytes::new_bound(py, bytes).into()); + } + if let Some(bytes) = self.special_tokens_decoder.get(&token) { + return Ok(PyBytes::new_bound(py, bytes).into()); + } + Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string())) + } + + // ==================== + // Miscellaneous + // ==================== + + fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> { + self.sorted_token_bytes + .iter() + .map(|x| PyBytes::new_bound(py, x).into()) + .collect() + } +} + +#[pyclass] +struct TiktokenBuffer { + tokens: Vec<Rank>, +} + +#[pymethods] +impl TiktokenBuffer { + // Based on https://github.com/PyO3/pyo3/blob/v0.22.2/tests/test_buffer_protocol.rs#L25 + unsafe fn __getbuffer__( + slf: Bound<'_, Self>, + view: *mut pyo3::ffi::Py_buffer, + flags: std::os::raw::c_int, + ) -> PyResult<()> { + if view.is_null() { + return Err(pyo3::exceptions::PyBufferError::new_err("View is null")); + } + if (flags & pyo3::ffi::PyBUF_WRITABLE) == pyo3::ffi::PyBUF_WRITABLE { + return Err(pyo3::exceptions::PyBufferError::new_err( + "Object is not writable", + )); + } + + (*view).obj = slf.clone().into_any().into_ptr(); + + let data = &slf.borrow().tokens; + (*view).buf = data.as_ptr() as *mut std::os::raw::c_void; + (*view).len = (data.len() * std::mem::size_of::<Rank>()) as isize; + (*view).readonly = 1; + (*view).itemsize = std::mem::size_of::<Rank>() as isize; + (*view).format = if (flags & pyo3::ffi::PyBUF_FORMAT) == pyo3::ffi::PyBUF_FORMAT { + let msg = std::ffi::CString::new("I").unwrap(); + msg.into_raw() + } else { + std::ptr::null_mut() + }; + (*view).ndim = 1; + (*view).shape = if (flags & pyo3::ffi::PyBUF_ND) == pyo3::ffi::PyBUF_ND { + &mut (*view).len + } else { + std::ptr::null_mut() + }; + (*view).strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == pyo3::ffi::PyBUF_STRIDES { + &mut (*view).itemsize + } else { + std::ptr::null_mut() + }; + (*view).suboffsets = 
std::ptr::null_mut(); + (*view).internal = std::ptr::null_mut(); + + Ok(()) + } + + unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) { + std::mem::drop(std::ffi::CString::from_raw((*view).format)); + } +} + +#[pymodule] +fn _tiktoken(_py: Python, m: &Bound<PyModule>) -> PyResult<()> { + m.add_class::<CoreBPE>()?; + Ok(()) +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/tests/test_pickle.py new/tiktoken-0.9.0/tests/test_pickle.py --- old/tiktoken-0.8.0/tests/test_pickle.py 1970-01-01 01:00:00.000000000 +0100 +++ new/tiktoken-0.9.0/tests/test_pickle.py 2025-02-14 06:53:03.000000000 +0100 @@ -0,0 +1,23 @@ +import tiktoken + + +def test_pickle(): + import pickle + + enc_old = tiktoken.get_encoding("r50k_base") + enc_new = pickle.loads(pickle.dumps(enc_old)) + assert enc_old.encode("hello world") == enc_new.encode("hello world") + + enc_old = tiktoken.Encoding( + name="custom_enc", + pat_str=enc_old._pat_str, + mergeable_ranks=enc_old._mergeable_ranks, + special_tokens={"<|pickle|>": 100_000}, + ) + enc_new = pickle.loads(pickle.dumps(enc_old)) + assert enc_old.encode("hello world") == enc_new.encode("hello world") + assert ( + enc_old.encode("<|pickle|>", allowed_special="all") + == enc_new.encode("<|pickle|>", allowed_special="all") + == [100_000] + ) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/__init__.py new/tiktoken-0.9.0/tiktoken/__init__.py --- old/tiktoken-0.8.0/tiktoken/__init__.py 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/tiktoken/__init__.py 2025-02-14 06:53:03.000000000 +0100 @@ -5,4 +5,4 @@ from .registry import get_encoding as get_encoding from .registry import list_encoding_names as list_encoding_names -__version__ = "0.8.0" +__version__ = "0.9.0" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/core.py new/tiktoken-0.9.0/tiktoken/core.py --- old/tiktoken-0.8.0/tiktoken/core.py 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/tiktoken/core.py 2025-02-14 06:53:03.000000000 +0100 @@ -2,12 +2,16 @@ import functools from concurrent.futures import ThreadPoolExecutor -from typing import AbstractSet, Collection, Literal, NoReturn, Sequence +from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, Sequence import regex from tiktoken import _tiktoken +if TYPE_CHECKING: + import numpy as np + import numpy.typing as npt + class Encoding: def __init__( @@ -128,6 +132,32 @@ text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace") return self._core_bpe.encode(text, allowed_special) + def encode_to_numpy( + self, + text: str, + *, + allowed_special: Literal["all"] | AbstractSet[str] = set(), # noqa: B006 + disallowed_special: Literal["all"] | Collection[str] = "all", + ) -> npt.NDArray[np.uint32]: + """Encodes a string into tokens, returning a numpy array. + + Avoids the overhead of copying the token buffer into a Python list. 
+ """ + if allowed_special == "all": + allowed_special = self.special_tokens_set + if disallowed_special == "all": + disallowed_special = self.special_tokens_set - allowed_special + if disallowed_special: + if not isinstance(disallowed_special, frozenset): + disallowed_special = frozenset(disallowed_special) + if match := _special_token_regex(disallowed_special).search(text): + raise_disallowed_special_token(match.group()) + + import numpy as np + + buffer = self._core_bpe.encode_to_tiktoken_buffer(text, self.special_tokens_set) + return np.frombuffer(buffer, dtype=np.uint32) + def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]: """Encodes a list of strings into tokens, in parallel, ignoring special tokens. @@ -332,6 +362,10 @@ def special_tokens_set(self) -> set[str]: return set(self._special_tokens.keys()) + def is_special_token(self, token: int) -> bool: + assert isinstance(token, int) + return token in self._special_token_values + @property def n_vocab(self) -> int: """For backwards compatibility. Prefer to use `enc.max_token_value + 1`.""" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/load.py new/tiktoken-0.9.0/tiktoken/load.py --- old/tiktoken-0.8.0/tiktoken/load.py 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/tiktoken/load.py 2025-02-14 06:53:03.000000000 +0100 @@ -2,12 +2,7 @@ import base64 import hashlib -import json import os -import tempfile -import uuid - -import requests def read_file(blobpath: str) -> bytes: @@ -20,7 +15,10 @@ ) from e with blobfile.BlobFile(blobpath, "rb") as f: return f.read() + # avoiding blobfile for public files helps avoid auth issues, like MFA prompts + import requests + resp = requests.get(blobpath) resp.raise_for_status() return resp.content @@ -38,6 +36,8 @@ elif "DATA_GYM_CACHE_DIR" in os.environ: cache_dir = os.environ["DATA_GYM_CACHE_DIR"] else: + import tempfile + cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache") user_specified_cache = False @@ -67,6 +67,8 @@ f"This may indicate a corrupted download. Please try again." ) + import uuid + try: os.makedirs(cache_dir, exist_ok=True) tmp_filename = cache_path + "." 
+ str(uuid.uuid4()) + ".tmp" @@ -114,6 +116,8 @@ bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n n += 1 + import json + # check that the encoder file matches the merges file # this sanity check is important since tiktoken assumes that ranks are ordered the same # as merge priority @@ -142,7 +146,13 @@ def load_tiktoken_bpe(tiktoken_bpe_file: str, expected_hash: str | None = None) -> dict[bytes, int]: # NB: do not add caching to this function contents = read_file_cached(tiktoken_bpe_file, expected_hash) - return { - base64.b64decode(token): int(rank) - for token, rank in (line.split() for line in contents.splitlines() if line) - } + ret = {} + for line in contents.splitlines(): + if not line: + continue + try: + token, rank = line.split() + ret[base64.b64decode(token)] = int(rank) + except Exception as e: + raise ValueError(f"Error parsing line {line!r} in {tiktoken_bpe_file}") from e + return ret diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/model.py new/tiktoken-0.9.0/tiktoken/model.py --- old/tiktoken-0.8.0/tiktoken/model.py 2024-10-03 23:15:34.000000000 +0200 +++ new/tiktoken-0.9.0/tiktoken/model.py 2025-02-14 06:53:03.000000000 +0100 @@ -6,6 +6,7 @@ # TODO: these will likely be replaced by an API endpoint MODEL_PREFIX_TO_ENCODING: dict[str, str] = { "o1-": "o200k_base", + "o3-": "o200k_base", # chat "chatgpt-4o-": "o200k_base", "gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13 @@ -13,6 +14,7 @@ "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. "gpt-35-turbo-": "cl100k_base", # Azure deployment name # fine-tuned + "ft:gpt-4o": "o200k_base", "ft:gpt-4": "cl100k_base", "ft:gpt-3.5-turbo": "cl100k_base", "ft:davinci-002": "cl100k_base", @@ -20,6 +22,9 @@ } MODEL_TO_ENCODING: dict[str, str] = { + # reasoning + "o1": "o200k_base", + "o3": "o200k_base", # chat "gpt-4o": "o200k_base", "gpt-4": "cl100k_base", ++++++ vendor.tar.zst ++++++ ++++ 564379 lines of diff (skipped)
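
A few illustrative snippets for reviewers who want to exercise the API changes visible in the diff above. These are sketches based on the diffed sources, not part of the package or its test suite.

The model map in tiktoken/model.py now covers the o1 and o3 reasoning models (exact names, the new "o3-" prefix, and "ft:gpt-4o"), all resolving to o200k_base. A minimal sketch of what that enables; the "o3-mini" name below is only an example of a prefix match:

    import tiktoken

    # Exact model names newly mapped to o200k_base in 0.9.0
    enc = tiktoken.encoding_for_model("o1")
    assert enc.name == "o200k_base"

    # Prefix match via the new "o3-" entry (model name here is illustrative)
    enc = tiktoken.encoding_for_model("o3-mini")
    assert enc.name == "o200k_base"

    print(enc.encode("hello world"))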
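Encoding.encode_to_numpy() is new in tiktoken/core.py and hands back the token buffer as a read-only numpy uint32 array instead of copying it into a Python list. A sketch of the expected call, assuming numpy is available (it is only imported inside the method):

    import numpy as np
    import tiktoken

    enc = tiktoken.get_encoding("o200k_base")

    # New buffer-backed path: tokens come back as a numpy array of dtype uint32
    arr = enc.encode_to_numpy("hello world")
    assert arr.dtype == np.uint32

    # For ordinary text this should match the list-based encoder
    assert arr.tolist() == enc.encode("hello world")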
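The new tests/test_pickle.py exercises pickling of Encoding objects; the same round trip can be checked by hand:

    import pickle
    import tiktoken

    enc = tiktoken.get_encoding("r50k_base")
    enc2 = pickle.loads(pickle.dumps(enc))

    # The unpickled encoder should behave identically
    assert enc.encode("hello world") == enc2.encode("hello world")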
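The changelog also advertises better error messages when loading invalid vocabulary files: load_tiktoken_bpe() in tiktoken/load.py now reports the offending line instead of failing with a bare unpacking error. A rough sketch of how that might be observed, assuming blobfile is installed (read_file() routes local, non-HTTP paths through blobfile):

    import base64
    import tempfile

    from tiktoken.load import load_tiktoken_bpe

    # Write a deliberately corrupted .tiktoken vocabulary: one good line, one bad
    with tempfile.NamedTemporaryFile("w", suffix=".tiktoken", delete=False) as f:
        f.write(base64.b64encode(b"hello").decode() + " 0\n")
        f.write("this-line-has-no-rank\n")
        path = f.name

    try:
        load_tiktoken_bpe(path)
    except ValueError as exc:
        # Expected to name the unparsable line and the file it came from
        print(exc)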