Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package python-tiktoken for openSUSE:Factory
checked in at 2025-03-05 13:42:48
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-tiktoken (Old)
and /work/SRC/openSUSE:Factory/.python-tiktoken.new.19136 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-tiktoken"
Wed Mar 5 13:42:48 2025 rev:3 rq:1250388 version:0.9.0
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-tiktoken/python-tiktoken.changes
2024-11-21 15:19:09.642783278 +0100
+++
/work/SRC/openSUSE:Factory/.python-tiktoken.new.19136/python-tiktoken.changes
2025-03-05 13:42:50.278375137 +0100
@@ -1,0 +2,11 @@
+Wed Mar 5 09:34:31 UTC 2025 - John Paul Adrian Glaubitz
<[email protected]>
+
+- Update to version 0.9.0:
+ * Join artifacts
+ * Partial sync of codebase
+ * Partial sync of codebase (#381)
+ * Add a link to PyPI in README (#318)
+ * Improve aarch64 and mac builds (#380)
+ * Partial sync of codebase (#379)
+
+-------------------------------------------------------------------
Old:
----
tiktoken-0.8.0.tar.zst
New:
----
tiktoken-0.9.0.tar.zst
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-tiktoken.spec ++++++
--- /var/tmp/diff_new_pack.tPIVil/_old 2025-03-05 13:42:51.186413138 +0100
+++ /var/tmp/diff_new_pack.tPIVil/_new 2025-03-05 13:42:51.190413305 +0100
@@ -1,7 +1,7 @@
#
# spec file for package python-tiktoken
#
-# Copyright (c) 2024 SUSE LLC
+# Copyright (c) 2025 SUSE LLC
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
@@ -18,7 +18,7 @@
%{?sle15_python_module_pythons}
Name: python-tiktoken
-Version: 0.8.0
+Version: 0.9.0
Release: 0
Summary: Fast BPE tokeniser for use with OpenAI's models
License: MIT
++++++ _service ++++++
--- /var/tmp/diff_new_pack.tPIVil/_old 2025-03-05 13:42:51.222414645 +0100
+++ /var/tmp/diff_new_pack.tPIVil/_new 2025-03-05 13:42:51.226414812 +0100
@@ -3,7 +3,7 @@
<param name="url">https://github.com/openai/tiktoken.git</param>
<param name="versionformat">@PARENT_TAG@</param>
<param name="scm">git</param>
- <param name="revision">0.8.0</param>
+ <param name="revision">0.9.0</param>
<param name="match-tag">*</param>
<param name="versionrewrite-pattern">v(\d+\.\d+\.\d+)</param>
<param name="versionrewrite-replacement">\1</param>
++++++ _servicedata ++++++
--- /var/tmp/diff_new_pack.tPIVil/_old 2025-03-05 13:42:51.246415649 +0100
+++ /var/tmp/diff_new_pack.tPIVil/_new 2025-03-05 13:42:51.250415816 +0100
@@ -1,6 +1,6 @@
<servicedata>
<service name="tar_scm">
<param
name="url">https://github.com/openai/tiktoken.git</param>
- <param
name="changesrevision">63527649963def8c759b0f91f2eb69a40934e468</param></service></servicedata>
+ <param
name="changesrevision">e35ab0915e37b919946b70947f1d0854196cb72c</param></service></servicedata>
(No newline at EOF)
++++++ tiktoken-0.8.0.tar.zst -> tiktoken-0.9.0.tar.zst ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/.github/workflows/build_wheels.yml
new/tiktoken-0.9.0/.github/workflows/build_wheels.yml
--- old/tiktoken-0.8.0/.github/workflows/build_wheels.yml 2024-10-03
23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/.github/workflows/build_wheels.yml 2025-02-14
06:53:03.000000000 +0100
@@ -22,7 +22,7 @@
steps:
- uses: actions/checkout@v4
- - uses: pypa/[email protected]
+ - uses: pypa/[email protected]
env:
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
@@ -38,19 +38,14 @@
strategy:
fail-fast: false
matrix:
- os: [ubuntu-latest]
+ os: [ubuntu-22.04-arm]
python-version: [39, 310, 311, 312, 313]
steps:
- uses: actions/checkout@v4
- - name: Setup up QEMU
- uses: docker/setup-qemu-action@v3
- with:
- platforms: arm64
-
- name: Build wheels
- uses: pypa/[email protected]
+ uses: pypa/[email protected]
env:
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
CIBW_ARCHS: aarch64
@@ -85,3 +80,15 @@
with:
name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
path: ./dist/*.tar.gz
+
+ join_artifacts:
+ name: Join artifacts
+ runs-on: ubuntu-latest
+ needs: [build_wheels, build_wheels_aarch64, build_sdist]
+ steps:
+ - name: Merge artifacts
+ uses: actions/upload-artifact/merge@v4
+ with:
+ name: cibw-wheels
+ pattern: cibw-wheels-*
+ delete-merged: true
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/CHANGELOG.md
new/tiktoken-0.9.0/CHANGELOG.md
--- old/tiktoken-0.8.0/CHANGELOG.md 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/CHANGELOG.md 2025-02-14 06:53:03.000000000 +0100
@@ -2,6 +2,12 @@
This is the changelog for the open source version of tiktoken.
+## [v0.9.0]
+- Support for `o1` and `o3` models
+- Better error messages when loading invalid vocabulary files
+- Support for encoding to numpy arrays
+- Delayed imports when not strictly necessary
+
## [v0.8.0]
- Support for `o1-` and `chatgpt-4o-` models
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/Cargo.toml
new/tiktoken-0.9.0/Cargo.toml
--- old/tiktoken-0.8.0/Cargo.toml 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/Cargo.toml 2025-02-14 06:53:03.000000000 +0100
@@ -1,15 +1,24 @@
[package]
name = "tiktoken"
-version = "0.8.0"
+version = "0.9.0"
edition = "2021"
rust-version = "1.57.0"
[lib]
-name = "_tiktoken"
-crate-type = ["cdylib"]
+name = "tiktoken"
+crate-type = ["cdylib", "rlib"]
+
+[features]
+default = []
+python = [
+ "pyo3",
+]
[dependencies]
-pyo3 = { version = "0.22.2", default-features = false, features =
["extension-module", "macros"] }
+pyo3 = { version = "0.22.2", default-features = false, features = [
+ "extension-module",
+ "macros",
+], optional = true }
# tiktoken dependencies
fancy-regex = "0.13.0"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/README.md new/tiktoken-0.9.0/README.md
--- old/tiktoken-0.8.0/README.md 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/README.md 2025-02-14 06:53:03.000000000 +0100
@@ -12,7 +12,7 @@
enc = tiktoken.encoding_for_model("gpt-4o")
```
-The open source version of `tiktoken` can be installed from PyPI:
+The open source version of `tiktoken` can be installed from
[PyPI](https://pypi.org/project/tiktoken):
```
pip install tiktoken
```
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/pyproject.toml
new/tiktoken-0.9.0/pyproject.toml
--- old/tiktoken-0.8.0/pyproject.toml 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/pyproject.toml 2025-02-14 06:53:03.000000000 +0100
@@ -1,12 +1,12 @@
[project]
name = "tiktoken"
-version = "0.8.0"
+version = "0.9.0"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
-license = {file = "LICENSE"}
-authors = [{name = "Shantanu Jain"}, {email = "[email protected]"}]
+license = { file = "LICENSE" }
+authors = [{ name = "Shantanu Jain" }, { email = "[email protected]" }]
dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
-optional-dependencies = {blobfile = ["blobfile>=2"]}
+optional-dependencies = { blobfile = ["blobfile>=2"] }
requires-python = ">=3.9"
[project.urls]
@@ -22,9 +22,10 @@
build-frontend = "build"
build-verbosity = 1
-linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs
| sh -s -- -y"
+linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs
| sh -s -- -y --profile minimal"
linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
macos.before-all = "rustup target add aarch64-apple-darwin x86_64-apple-darwin"
+macos.environment = { MACOSX_DEPLOYMENT_TARGET = "10.12" }
skip = [
"*-manylinux_i686",
@@ -39,7 +40,3 @@
before-test = "pip install pytest hypothesis"
test-command = "pytest {project}/tests --import-mode=append"
-
-[[tool.cibuildwheel.overrides]]
-select = "*linux_aarch64"
-test-command = """python -c 'import tiktoken; enc =
tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373,
995]'"""
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/setup.py new/tiktoken-0.9.0/setup.py
--- old/tiktoken-0.8.0/setup.py 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/setup.py 2025-02-14 06:53:03.000000000 +0100
@@ -10,6 +10,7 @@
# Between our use of editable installs and wanting to use Rust for
performance sensitive
# code, it makes sense to just always use --release
debug=False,
+ features=["python"],
)
],
package_data={"tiktoken": ["py.typed"]},
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/src/lib.rs
new/tiktoken-0.9.0/src/lib.rs
--- old/tiktoken-0.8.0/src/lib.rs 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/src/lib.rs 2025-02-14 06:53:03.000000000 +0100
@@ -1,19 +1,18 @@
-// This check is new and seems buggy (possibly with PyO3 interaction)
-#![allow(clippy::borrow_deref_ref)]
-
+use std::borrow::Borrow;
+use std::borrow::Cow;
use std::collections::HashSet;
use std::num::NonZeroU64;
use std::thread;
use fancy_regex::Regex;
-use pyo3::exceptions;
+#[cfg(feature = "python")]
use pyo3::prelude::*;
-use pyo3::pybacked::PyBackedStr;
-use pyo3::types::{PyBytes, PyList, PyTuple};
-use pyo3::PyResult;
use rustc_hash::FxHashMap as HashMap;
-type Rank = u32;
+#[cfg(feature = "python")]
+mod py;
+
+pub type Rank = u32;
fn _byte_pair_merge(ranks: &HashMap<Vec<u8>, Rank>, piece: &[u8]) ->
Vec<(usize, Rank)> {
// This is a vector of (start, rank).
@@ -132,7 +131,7 @@
// The current implementation ends up doing a lot of hashing of bytes. In
theory, this could be made
// to be hashing of two-tuples of ints, which looks like it may also be a
couple percent faster.
-pub struct FakeThreadId(NonZeroU64);
+struct FakeThreadId(NonZeroU64);
fn hash_current_thread() -> usize {
// It's easier to use unsafe than to use nightly. Rust has this nice u64
thread id counter
@@ -148,8 +147,8 @@
}
#[derive(Debug, Clone)]
-struct DecodeKeyError {
- token: Rank,
+pub struct DecodeKeyError {
+ pub token: Rank,
}
impl std::fmt::Display for DecodeKeyError {
@@ -158,10 +157,26 @@
}
}
+impl std::error::Error for DecodeKeyError {}
+
+#[derive(Debug, Clone)]
+pub struct DecodeError {
+ pub message: String,
+}
+
+impl std::fmt::Display for DecodeError {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ write!(f, "Could not decode tokens: {}", self.message)
+ }
+}
+
+impl std::error::Error for DecodeError {}
+
const MAX_NUM_THREADS: usize = 128;
-#[pyclass]
-struct CoreBPE {
+#[cfg_attr(feature = "python", pyclass)]
+#[derive(Clone)]
+pub struct CoreBPE {
encoder: HashMap<Vec<u8>, Rank>,
special_tokens_encoder: HashMap<String, Rank>,
decoder: HashMap<Rank, Vec<u8>>,
@@ -183,7 +198,10 @@
&self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]
}
- fn _decode_native(&self, tokens: &[Rank]) -> Result<Vec<u8>,
DecodeKeyError> {
+ /// Decodes tokens into a list of bytes.
+ ///
+ /// The bytes are not guaranteed to be a valid utf-8 string.
+ fn decode_bytes(&self, tokens: &[Rank]) -> Result<Vec<u8>, DecodeKeyError>
{
let mut ret = Vec::with_capacity(tokens.len() * 2);
for &token in tokens {
let token_bytes = match self.decoder.get(&token) {
@@ -198,7 +216,7 @@
Ok(ret)
}
- fn _encode_ordinary_native(&self, text: &str) -> Vec<Rank> {
+ pub fn encode_ordinary(&self, text: &str) -> Vec<Rank> {
// This is the core of the encoding logic; the other functions in here
// just make things complicated :-)
let regex = self._get_tl_regex();
@@ -213,7 +231,7 @@
ret
}
- fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) ->
(Vec<Rank>, usize) {
+ pub fn encode(&self, text: &str, allowed_special: &HashSet<&str>) ->
(Vec<Rank>, usize) {
let special_regex = self._get_tl_special_regex();
let regex = self._get_tl_regex();
let mut ret = vec![];
@@ -308,12 +326,12 @@
(tokens, last_piece_token_len)
}
- fn _encode_unstable_native(
+ pub fn _encode_unstable_native(
&self,
text: &str,
allowed_special: &HashSet<&str>,
) -> (Vec<Rank>, HashSet<Vec<Rank>>) {
- let (tokens, last_piece_token_len) = self._encode_native(text,
allowed_special);
+ let (tokens, last_piece_token_len) = self.encode(text,
allowed_special);
if last_piece_token_len == 0 {
// If last_piece_token_len is zero, the last token was a special
token and we have
// no unstable bytes
@@ -323,7 +341,7 @@
self._increase_last_piece_token_len(tokens, last_piece_token_len);
let unstable_bytes = self
- ._decode_native(&tokens[tokens.len() - last_piece_token_len..])
+ .decode_bytes(&tokens[tokens.len() - last_piece_token_len..])
.unwrap();
tokens.truncate(tokens.len() - last_piece_token_len);
@@ -372,7 +390,7 @@
// So convert to UTF-8 and do regex splitting.
// E.g. with cl100k_base " !" gets split to " " + " !",
// but byte_pair_encode(" !") != byte_pair_encode(" ")
- Ok(s) => self._encode_ordinary_native(s),
+ Ok(s) => self.encode_ordinary(s),
// Technically, whether or not this arm is correct depends
on whether there
// would be a regex split before the UTF-8 truncation
point.
@@ -425,26 +443,37 @@
(tokens, completions)
}
-}
-#[pymethods]
-impl CoreBPE {
- #[new]
- fn new(
+ pub fn new<E, SE, NSE>(
+ encoder: E,
+ special_tokens_encoder: SE,
+ pattern: &str,
+ ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>>
+ where
+ E: IntoIterator<Item = (Vec<u8>, Rank)>,
+ SE: IntoIterator<Item = (String, Rank)>,
+ NSE: IntoIterator<Item = (String, (Rank, Rank))>,
+ {
+ Self::new_internal(
+ HashMap::from_iter(encoder),
+ HashMap::from_iter(special_tokens_encoder),
+ pattern,
+ )
+ }
+
+ fn new_internal(
encoder: HashMap<Vec<u8>, Rank>,
special_tokens_encoder: HashMap<String, Rank>,
pattern: &str,
- ) -> PyResult<Self> {
- let regex = Regex::new(pattern)
- .map_err(|e| PyErr::new::<exceptions::PyValueError,
_>(e.to_string()))?;
+ ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
+ let regex = Regex::new(pattern)?;
let special_regex = {
- let _parts = special_tokens_encoder
+ let parts = special_tokens_encoder
.keys()
.map(|s| fancy_regex::escape(s))
.collect::<Vec<_>>();
- Regex::new(&_parts.join("|"))
- .map_err(|e| PyErr::new::<exceptions::PyValueError,
_>(e.to_string()))?
+ Regex::new(&parts.join("|"))?
};
let decoder: HashMap<Rank, Vec<u8>> =
@@ -464,7 +493,7 @@
let mut sorted_token_bytes: Vec<Vec<u8>> =
encoder.keys().cloned().collect();
sorted_token_bytes.sort();
- Ok(CoreBPE {
+ Ok(Self {
encoder,
special_tokens_encoder,
decoder,
@@ -477,208 +506,22 @@
})
}
- // ====================
- // Encoding
- // ====================
-
- fn encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> {
- py.allow_threads(|| self._encode_ordinary_native(text))
- }
-
- fn encode(&self, py: Python, text: &str, allowed_special:
HashSet<PyBackedStr>) -> Vec<Rank> {
- py.allow_threads(|| {
- let allowed_special: HashSet<&str> =
- allowed_special.iter().map(|s| s.as_ref()).collect();
- self._encode_native(text, &allowed_special).0
- })
- }
-
- fn encode_to_tiktoken_buffer(
- &self,
- py: Python,
- text: &str,
- allowed_special: HashSet<PyBackedStr>,
- ) -> Py<PyAny> {
- let tokens = py.allow_threads(|| {
- let allowed_special: HashSet<&str> =
- allowed_special.iter().map(|s| s.as_ref()).collect();
- self._encode_native(text, &allowed_special).0
- });
- let buffer = TiktokenBuffer { tokens };
- buffer.into_py(py)
- }
-
- fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> {
- py.allow_threads(|| {
- match std::str::from_utf8(bytes) {
- Ok(text) => self._encode_ordinary_native(text),
- Err(e) => {
- let text = unsafe {
std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
- let (tokens, last_piece_token_len) =
self._encode_native(text, &HashSet::new());
- let (mut tokens, last_piece_token_len) =
- self._increase_last_piece_token_len(tokens,
last_piece_token_len);
- if !tokens.is_empty() && last_piece_token_len > 0 {
- // Lop off the tokens from the last piece and run BPE
on the remaining bytes
- // Somewhat niche, but this may not be correct if we'd
have had a regex
- // split between the valid UTF-8 and the invalid
bytes, which is why this
- // method is private
- let mut unstable_bytes = self
- ._decode_native(&tokens[tokens.len() -
last_piece_token_len..])
- .unwrap();
-
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
-
- tokens.truncate(tokens.len() - last_piece_token_len);
- match self.encoder.get(&unstable_bytes) {
- Some(token) => tokens.push(*token),
- None => {
-
tokens.extend(&byte_pair_encode(&unstable_bytes, &self.encoder))
- }
- }
- }
- tokens
- }
- }
- })
- }
-
- fn encode_with_unstable(
- &self,
- py: Python,
- text: &str,
- allowed_special: HashSet<PyBackedStr>,
- ) -> Py<PyTuple> {
- let (tokens, completions) = py.allow_threads(|| {
- let allowed_special: HashSet<&str> =
- allowed_special.iter().map(|s| s.as_ref()).collect();
- self._encode_unstable_native(text, &allowed_special)
- });
- let py_completions = PyList::new_bound(
- py,
- completions
- .iter()
- .map(|seq| PyList::new_bound(py, &seq[..])),
- );
- (tokens, py_completions).into_py(py)
- }
-
- fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> {
- if let Some(token) = self.encoder.get(piece).copied() {
- return Ok(token);
- }
- if let Ok(piece_str) = std::str::from_utf8(piece) {
- if let Some(token) =
self.special_tokens_encoder.get(piece_str).copied() {
- return Ok(token);
- }
- }
- Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned()))
- }
-
- fn encode_single_piece(&self, piece: &[u8]) -> Vec<Rank> {
- if let Some(token) = self.encoder.get(piece) {
- return vec![*token];
- }
- byte_pair_encode(piece, &self.encoder)
- }
-
- // ====================
- // Decoding
- // ====================
-
- fn decode_bytes(&self, py: Python, tokens: Vec<Rank>) ->
Result<Py<PyBytes>, PyErr> {
- match py.allow_threads(|| self._decode_native(&tokens)) {
- Ok(bytes) => Ok(PyBytes::new_bound(py, &bytes).into()),
- Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}",
e))),
- }
- }
-
- fn decode_single_token_bytes(&self, py: Python, token: Rank) ->
PyResult<Py<PyBytes>> {
- if let Some(bytes) = self.decoder.get(&token) {
- return Ok(PyBytes::new_bound(py, bytes).into());
- }
- if let Some(bytes) = self.special_tokens_decoder.get(&token) {
- return Ok(PyBytes::new_bound(py, bytes).into());
- }
- Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
- }
-
- // ====================
- // Miscellaneous
- // ====================
-
- fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
- self.sorted_token_bytes
- .iter()
- .map(|x| PyBytes::new_bound(py, x).into())
+ pub fn special_tokens(&self) -> HashSet<&str> {
+ self.special_tokens_encoder
+ .keys()
+ .map(|s| s.as_str())
.collect()
}
-}
-
-#[pyclass]
-struct TiktokenBuffer {
- tokens: Vec<Rank>,
-}
-
-#[pymethods]
-impl TiktokenBuffer {
- // Based on
https://github.com/PyO3/pyo3/blob/v0.22.2/tests/test_buffer_protocol.rs#L25
- unsafe fn __getbuffer__(
- slf: Bound<'_, Self>,
- view: *mut pyo3::ffi::Py_buffer,
- flags: std::os::raw::c_int,
- ) -> PyResult<()> {
- if view.is_null() {
- return Err(pyo3::exceptions::PyBufferError::new_err("View is
null"));
- }
- if (flags & pyo3::ffi::PyBUF_WRITABLE) == pyo3::ffi::PyBUF_WRITABLE {
- return Err(pyo3::exceptions::PyBufferError::new_err(
- "Object is not writable",
- ));
- }
-
- (*view).obj = slf.clone().into_any().into_ptr();
-
- let data = &slf.borrow().tokens;
- (*view).buf = data.as_ptr() as *mut std::os::raw::c_void;
- (*view).len = (data.len() * std::mem::size_of::<Rank>()) as isize;
- (*view).readonly = 1;
- (*view).itemsize = std::mem::size_of::<Rank>() as isize;
- (*view).format = if (flags & pyo3::ffi::PyBUF_FORMAT) ==
pyo3::ffi::PyBUF_FORMAT {
- let msg = std::ffi::CString::new("I").unwrap();
- msg.into_raw()
- } else {
- std::ptr::null_mut()
- };
- (*view).ndim = 1;
- (*view).shape = if (flags & pyo3::ffi::PyBUF_ND) ==
pyo3::ffi::PyBUF_ND {
- &mut (*view).len
- } else {
- std::ptr::null_mut()
- };
- (*view).strides = if (flags & pyo3::ffi::PyBUF_STRIDES) ==
pyo3::ffi::PyBUF_STRIDES {
- &mut (*view).itemsize
- } else {
- std::ptr::null_mut()
- };
- (*view).suboffsets = std::ptr::null_mut();
- (*view).internal = std::ptr::null_mut();
-
- Ok(())
- }
- unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) {
- std::mem::drop(std::ffi::CString::from_raw((*view).format));
+ pub fn encode_with_special_tokens(&self, text: &str) -> Vec<Rank> {
+ let allowed_special = self.special_tokens();
+ self.encode(text, &allowed_special).0
}
}
-#[pymodule]
-fn _tiktoken(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
- m.add_class::<CoreBPE>()?;
- Ok(())
-}
-
#[cfg(test)]
mod tests {
-
+ use fancy_regex::Regex;
use rustc_hash::FxHashMap as HashMap;
use crate::{byte_pair_split, Rank};
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/src/py.rs new/tiktoken-0.9.0/src/py.rs
--- old/tiktoken-0.8.0/src/py.rs 1970-01-01 01:00:00.000000000 +0100
+++ new/tiktoken-0.9.0/src/py.rs 2025-02-14 06:53:03.000000000 +0100
@@ -0,0 +1,236 @@
+use std::collections::HashSet;
+
+use pyo3::{
+ exceptions,
+ prelude::*,
+ pybacked::PyBackedStr,
+ types::{PyBytes, PyList, PyTuple},
+ PyResult,
+};
+use rustc_hash::FxHashMap as HashMap;
+
+use crate::{byte_pair_encode, CoreBPE, Rank};
+
+#[pymethods]
+impl CoreBPE {
+ #[new]
+ fn py_new(
+ encoder: HashMap<Vec<u8>, Rank>,
+ special_tokens_encoder: HashMap<String, Rank>,
+ pattern: &str,
+ ) -> PyResult<Self> {
+ Self::new_internal(
+ encoder,
+ special_tokens_encoder,
+ pattern,
+ )
+ .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))
+ }
+
+ // ====================
+ // Encoding
+ // ====================
+
+ #[pyo3(name = "encode_ordinary")]
+ fn py_encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> {
+ py.allow_threads(|| self.encode_ordinary(text))
+ }
+
+ #[pyo3(name = "encode")]
+ fn py_encode(
+ &self,
+ py: Python,
+ text: &str,
+ allowed_special: HashSet<PyBackedStr>,
+ ) -> Vec<Rank> {
+ py.allow_threads(|| {
+ let allowed_special: HashSet<&str> =
+ allowed_special.iter().map(|s| s.as_ref()).collect();
+ self.encode(text, &allowed_special).0
+ })
+ }
+
+ fn encode_to_tiktoken_buffer(
+ &self,
+ py: Python,
+ text: &str,
+ allowed_special: HashSet<PyBackedStr>,
+ ) -> Py<PyAny> {
+ let tokens = py.allow_threads(|| {
+ let allowed_special: HashSet<&str> =
+ allowed_special.iter().map(|s| s.as_ref()).collect();
+ self.encode(text, &allowed_special).0
+ });
+ let buffer = TiktokenBuffer { tokens };
+ buffer.into_py(py)
+ }
+
+ fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> {
+ py.allow_threads(|| {
+ match std::str::from_utf8(bytes) {
+ Ok(text) => self.encode_ordinary(text),
+ Err(e) => {
+ let text = unsafe {
std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
+ let (tokens, last_piece_token_len) = self.encode(text,
&HashSet::new());
+ let (mut tokens, last_piece_token_len) =
+ self._increase_last_piece_token_len(tokens,
last_piece_token_len);
+ if !tokens.is_empty() && last_piece_token_len > 0 {
+ // Lop off the tokens from the last piece and run BPE
on the remaining bytes
+ // Somewhat niche, but this may not be correct if we'd
have had a regex
+ // split between the valid UTF-8 and the invalid
bytes, which is why this
+ // method is private
+ let mut unstable_bytes = self
+ .decode_bytes(&tokens[tokens.len() -
last_piece_token_len..])
+ .unwrap();
+
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
+
+ tokens.truncate(tokens.len() - last_piece_token_len);
+ match self.encoder.get(&unstable_bytes) {
+ Some(token) => tokens.push(*token),
+ None => {
+
tokens.extend(&byte_pair_encode(&unstable_bytes, &self.encoder))
+ }
+ }
+ }
+ tokens
+ }
+ }
+ })
+ }
+
+ #[pyo3(name = "encode_with_unstable")]
+ fn py_encode_with_unstable(
+ &self,
+ py: Python,
+ text: &str,
+ allowed_special: HashSet<PyBackedStr>,
+ ) -> Py<PyTuple> {
+ let (tokens, completions) = py.allow_threads(|| {
+ let allowed_special: HashSet<&str> =
+ allowed_special.iter().map(|s| s.as_ref()).collect();
+ self._encode_unstable_native(text, &allowed_special)
+ });
+ let py_completions = PyList::new_bound(
+ py,
+ completions
+ .iter()
+ .map(|seq| PyList::new_bound(py, &seq[..])),
+ );
+ (tokens, py_completions).into_py(py)
+ }
+
+ fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> {
+ if let Some(token) = self.encoder.get(piece).copied() {
+ return Ok(token);
+ }
+ if let Ok(piece_str) = std::str::from_utf8(piece) {
+ if let Some(token) =
self.special_tokens_encoder.get(piece_str).copied() {
+ return Ok(token);
+ }
+ }
+ Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned()))
+ }
+
+ fn encode_single_piece(&self, piece: &[u8]) -> Vec<Rank> {
+ if let Some(token) = self.encoder.get(piece) {
+ return vec![*token];
+ }
+ byte_pair_encode(piece, &self.encoder)
+ }
+
+ // ====================
+ // Decoding
+ // ====================
+
+ #[pyo3(name = "decode_bytes")]
+ fn py_decode_bytes(&self, py: Python, tokens: Vec<Rank>) ->
Result<Py<PyBytes>, PyErr> {
+ match py.allow_threads(|| self.decode_bytes(&tokens)) {
+ Ok(bytes) => Ok(PyBytes::new_bound(py, &bytes).into()),
+ Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}",
e))),
+ }
+ }
+
+ fn decode_single_token_bytes(&self, py: Python, token: Rank) ->
PyResult<Py<PyBytes>> {
+ if let Some(bytes) = self.decoder.get(&token) {
+ return Ok(PyBytes::new_bound(py, bytes).into());
+ }
+ if let Some(bytes) = self.special_tokens_decoder.get(&token) {
+ return Ok(PyBytes::new_bound(py, bytes).into());
+ }
+ Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
+ }
+
+ // ====================
+ // Miscellaneous
+ // ====================
+
+ fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
+ self.sorted_token_bytes
+ .iter()
+ .map(|x| PyBytes::new_bound(py, x).into())
+ .collect()
+ }
+}
+
+#[pyclass]
+struct TiktokenBuffer {
+ tokens: Vec<Rank>,
+}
+
+#[pymethods]
+impl TiktokenBuffer {
+ // Based on
https://github.com/PyO3/pyo3/blob/v0.22.2/tests/test_buffer_protocol.rs#L25
+ unsafe fn __getbuffer__(
+ slf: Bound<'_, Self>,
+ view: *mut pyo3::ffi::Py_buffer,
+ flags: std::os::raw::c_int,
+ ) -> PyResult<()> {
+ if view.is_null() {
+ return Err(pyo3::exceptions::PyBufferError::new_err("View is
null"));
+ }
+ if (flags & pyo3::ffi::PyBUF_WRITABLE) == pyo3::ffi::PyBUF_WRITABLE {
+ return Err(pyo3::exceptions::PyBufferError::new_err(
+ "Object is not writable",
+ ));
+ }
+
+ (*view).obj = slf.clone().into_any().into_ptr();
+
+ let data = &slf.borrow().tokens;
+ (*view).buf = data.as_ptr() as *mut std::os::raw::c_void;
+ (*view).len = (data.len() * std::mem::size_of::<Rank>()) as isize;
+ (*view).readonly = 1;
+ (*view).itemsize = std::mem::size_of::<Rank>() as isize;
+ (*view).format = if (flags & pyo3::ffi::PyBUF_FORMAT) ==
pyo3::ffi::PyBUF_FORMAT {
+ let msg = std::ffi::CString::new("I").unwrap();
+ msg.into_raw()
+ } else {
+ std::ptr::null_mut()
+ };
+ (*view).ndim = 1;
+ (*view).shape = if (flags & pyo3::ffi::PyBUF_ND) ==
pyo3::ffi::PyBUF_ND {
+ &mut (*view).len
+ } else {
+ std::ptr::null_mut()
+ };
+ (*view).strides = if (flags & pyo3::ffi::PyBUF_STRIDES) ==
pyo3::ffi::PyBUF_STRIDES {
+ &mut (*view).itemsize
+ } else {
+ std::ptr::null_mut()
+ };
+ (*view).suboffsets = std::ptr::null_mut();
+ (*view).internal = std::ptr::null_mut();
+
+ Ok(())
+ }
+
+ unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) {
+ std::mem::drop(std::ffi::CString::from_raw((*view).format));
+ }
+}
+
+#[pymodule]
+fn _tiktoken(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
+ m.add_class::<CoreBPE>()?;
+ Ok(())
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/tests/test_pickle.py
new/tiktoken-0.9.0/tests/test_pickle.py
--- old/tiktoken-0.8.0/tests/test_pickle.py 1970-01-01 01:00:00.000000000
+0100
+++ new/tiktoken-0.9.0/tests/test_pickle.py 2025-02-14 06:53:03.000000000
+0100
@@ -0,0 +1,23 @@
+import tiktoken
+
+
+def test_pickle():
+ import pickle
+
+ enc_old = tiktoken.get_encoding("r50k_base")
+ enc_new = pickle.loads(pickle.dumps(enc_old))
+ assert enc_old.encode("hello world") == enc_new.encode("hello world")
+
+ enc_old = tiktoken.Encoding(
+ name="custom_enc",
+ pat_str=enc_old._pat_str,
+ mergeable_ranks=enc_old._mergeable_ranks,
+ special_tokens={"<|pickle|>": 100_000},
+ )
+ enc_new = pickle.loads(pickle.dumps(enc_old))
+ assert enc_old.encode("hello world") == enc_new.encode("hello world")
+ assert (
+ enc_old.encode("<|pickle|>", allowed_special="all")
+ == enc_new.encode("<|pickle|>", allowed_special="all")
+ == [100_000]
+ )
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/__init__.py
new/tiktoken-0.9.0/tiktoken/__init__.py
--- old/tiktoken-0.8.0/tiktoken/__init__.py 2024-10-03 23:15:34.000000000
+0200
+++ new/tiktoken-0.9.0/tiktoken/__init__.py 2025-02-14 06:53:03.000000000
+0100
@@ -5,4 +5,4 @@
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names
-__version__ = "0.8.0"
+__version__ = "0.9.0"
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/core.py
new/tiktoken-0.9.0/tiktoken/core.py
--- old/tiktoken-0.8.0/tiktoken/core.py 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/tiktoken/core.py 2025-02-14 06:53:03.000000000 +0100
@@ -2,12 +2,16 @@
import functools
from concurrent.futures import ThreadPoolExecutor
-from typing import AbstractSet, Collection, Literal, NoReturn, Sequence
+from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn,
Sequence
import regex
from tiktoken import _tiktoken
+if TYPE_CHECKING:
+ import numpy as np
+ import numpy.typing as npt
+
class Encoding:
def __init__(
@@ -128,6 +132,32 @@
text = text.encode("utf-16", "surrogatepass").decode("utf-16",
"replace")
return self._core_bpe.encode(text, allowed_special)
+ def encode_to_numpy(
+ self,
+ text: str,
+ *,
+ allowed_special: Literal["all"] | AbstractSet[str] = set(), # noqa:
B006
+ disallowed_special: Literal["all"] | Collection[str] = "all",
+ ) -> npt.NDArray[np.uint32]:
+ """Encodes a string into tokens, returning a numpy array.
+
+ Avoids the overhead of copying the token buffer into a Python list.
+ """
+ if allowed_special == "all":
+ allowed_special = self.special_tokens_set
+ if disallowed_special == "all":
+ disallowed_special = self.special_tokens_set - allowed_special
+ if disallowed_special:
+ if not isinstance(disallowed_special, frozenset):
+ disallowed_special = frozenset(disallowed_special)
+ if match := _special_token_regex(disallowed_special).search(text):
+ raise_disallowed_special_token(match.group())
+
+ import numpy as np
+
+ buffer = self._core_bpe.encode_to_tiktoken_buffer(text,
self.special_tokens_set)
+ return np.frombuffer(buffer, dtype=np.uint32)
+
def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8)
-> list[list[int]]:
"""Encodes a list of strings into tokens, in parallel, ignoring
special tokens.
@@ -332,6 +362,10 @@
def special_tokens_set(self) -> set[str]:
return set(self._special_tokens.keys())
+ def is_special_token(self, token: int) -> bool:
+ assert isinstance(token, int)
+ return token in self._special_token_values
+
@property
def n_vocab(self) -> int:
"""For backwards compatibility. Prefer to use `enc.max_token_value +
1`."""
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/load.py
new/tiktoken-0.9.0/tiktoken/load.py
--- old/tiktoken-0.8.0/tiktoken/load.py 2024-10-03 23:15:34.000000000 +0200
+++ new/tiktoken-0.9.0/tiktoken/load.py 2025-02-14 06:53:03.000000000 +0100
@@ -2,12 +2,7 @@
import base64
import hashlib
-import json
import os
-import tempfile
-import uuid
-
-import requests
def read_file(blobpath: str) -> bytes:
@@ -20,7 +15,10 @@
) from e
with blobfile.BlobFile(blobpath, "rb") as f:
return f.read()
+
# avoiding blobfile for public files helps avoid auth issues, like MFA
prompts
+ import requests
+
resp = requests.get(blobpath)
resp.raise_for_status()
return resp.content
@@ -38,6 +36,8 @@
elif "DATA_GYM_CACHE_DIR" in os.environ:
cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
else:
+ import tempfile
+
cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
user_specified_cache = False
@@ -67,6 +67,8 @@
f"This may indicate a corrupted download. Please try again."
)
+ import uuid
+
try:
os.makedirs(cache_dir, exist_ok=True)
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
@@ -114,6 +116,8 @@
bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n
n += 1
+ import json
+
# check that the encoder file matches the merges file
# this sanity check is important since tiktoken assumes that ranks are
ordered the same
# as merge priority
@@ -142,7 +146,13 @@
def load_tiktoken_bpe(tiktoken_bpe_file: str, expected_hash: str | None =
None) -> dict[bytes, int]:
# NB: do not add caching to this function
contents = read_file_cached(tiktoken_bpe_file, expected_hash)
- return {
- base64.b64decode(token): int(rank)
- for token, rank in (line.split() for line in contents.splitlines() if
line)
- }
+ ret = {}
+ for line in contents.splitlines():
+ if not line:
+ continue
+ try:
+ token, rank = line.split()
+ ret[base64.b64decode(token)] = int(rank)
+ except Exception as e:
+ raise ValueError(f"Error parsing line {line!r} in
{tiktoken_bpe_file}") from e
+ return ret
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/tiktoken-0.8.0/tiktoken/model.py
new/tiktoken-0.9.0/tiktoken/model.py
--- old/tiktoken-0.8.0/tiktoken/model.py 2024-10-03 23:15:34.000000000
+0200
+++ new/tiktoken-0.9.0/tiktoken/model.py 2025-02-14 06:53:03.000000000
+0100
@@ -6,6 +6,7 @@
# TODO: these will likely be replaced by an API endpoint
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
"o1-": "o200k_base",
+ "o3-": "o200k_base",
# chat
"chatgpt-4o-": "o200k_base",
"gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13
@@ -13,6 +14,7 @@
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
# fine-tuned
+ "ft:gpt-4o": "o200k_base",
"ft:gpt-4": "cl100k_base",
"ft:gpt-3.5-turbo": "cl100k_base",
"ft:davinci-002": "cl100k_base",
@@ -20,6 +22,9 @@
}
MODEL_TO_ENCODING: dict[str, str] = {
+ # reasoning
+ "o1": "o200k_base",
+ "o3": "o200k_base",
# chat
"gpt-4o": "o200k_base",
"gpt-4": "cl100k_base",
++++++ vendor.tar.zst ++++++
++++ 564379 lines of diff (skipped)