Add decode stream #64
Changes from 8 commits
@@ -0,0 +1,57 @@

```elixir
defmodule Tokenizers.DecodeStream do
  @moduledoc """
  Implements streaming decoding functionality for tokenizers.
  """

  @enforce_keys [:resource]
  defstruct [:resource]

  @type t :: %__MODULE__{
          resource: reference()
        }

  @doc """
  Creates a new decode stream.

  The `skip_special_tokens` option determines whether special tokens should be
  skipped during decoding. By default, it is set to `false`.
  """
  @spec new(boolean()) :: t()
  def new(skip_special_tokens \\ false) do
    Tokenizers.Native.decoder_stream_new(skip_special_tokens)
  end

  @doc """
  Steps through the decode stream with the given tokenizer and token ID.

  Returns `{:ok, string}` if there's a decoded string, or `{:ok, nil}` if
  there's nothing more to decode.

  Returns `{:error, reason}` if an error occurs during decoding.
  """
  def step(%__MODULE__{} = decode_stream, tokenizer, id) when is_integer(id) do
    case Tokenizers.Native.decoder_stream_step(decode_stream, tokenizer, id) do
      {:ok, result} -> {:ok, result}
      {:error, reason} -> {:error, reason}
    end
  end
```
Review thread:

**Contributor:** Is it actually efficient going step after step by giving specific indexes? Or does the upstream code work better by calling something […]?

**Contributor (author):** In this PR I translated the API as is. Some abstraction can definitely be written on top of it using […]. Decoding large chunks this way looks better for the BEAM because control is given back from the NIF. There is a possibility to write the decoding without a dirty NIF, but I'm not sure somebody will invest in it.

**Contributor:** Got it! Thank you!
```elixir
  @doc """
  Returns information about the decode stream state.
  """
  def info(%__MODULE__{} = decode_stream) do
    Tokenizers.Native.decoder_stream_info(decode_stream)
  end

  defimpl Inspect do
    def inspect(decode_stream, _opts) do
      info = Tokenizers.DecodeStream.info(decode_stream)
      "#Tokenizers.DecodeStream<#{inspect(info)}>"
    end
  end

  defimpl String.Chars do
    def to_string(decode_stream) do
      info = Tokenizers.DecodeStream.info(decode_stream)
      "#Tokenizers.DecodeStream<#{inspect(info)}>"
    end
  end
end
```
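For context, here is a usage sketch of the API this PR adds. It is not part of the diff: the model name and token IDs are placeholders, and it assumes the library's existing `Tokenizers.Tokenizer.from_pretrained/1` for loading a tokenizer.

```elixir
# Usage sketch (not part of this PR). Placeholder model name and token IDs;
# assumes Tokenizers.Tokenizer.from_pretrained/1 from the existing library API.
{:ok, tokenizer} = Tokenizers.Tokenizer.from_pretrained("bert-base-cased")

decode_stream = Tokenizers.DecodeStream.new(true)
ids = [101, 7592, 2088, 102]

decoded =
  Enum.reduce(ids, "", fn id, acc ->
    case Tokenizers.DecodeStream.step(decode_stream, tokenizer, id) do
      # A decoded fragment is available for this token ID.
      {:ok, chunk} when is_binary(chunk) -> acc <> chunk
      # No text produced for this step.
      {:ok, nil} -> acc
      {:error, reason} -> raise "decode failed: #{inspect(reason)}"
    end
  end)
```

Accumulating the `{:ok, chunk}` results in order reconstructs the decoded text incrementally.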
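On the review question about stepping token by token: the kind of higher-level abstraction the author alludes to could, for example, wrap `step/3` in a lazy `Stream`. This is a minimal sketch under that assumption, not part of the PR; the module and function names are hypothetical, and `tokenizer` and `ids` are assumed to be supplied by the caller.

```elixir
# Hypothetical wrapper (not part of this PR): expose the step-by-step API
# as a lazy stream of decoded text fragments.
defmodule DecodeStreamHelper do
  def text_chunks(decode_stream, tokenizer, ids) do
    ids
    |> Stream.map(&Tokenizers.DecodeStream.step(decode_stream, tokenizer, &1))
    |> Stream.flat_map(fn
      # Skip steps that produced no text.
      {:ok, nil} -> []
      # Emit decoded fragments as they appear.
      {:ok, chunk} -> [chunk]
      {:error, reason} -> raise "decode failed: #{inspect(reason)}"
    end)
  end
end

# Example:
#   decode_stream = Tokenizers.DecodeStream.new()
#   DecodeStreamHelper.text_chunks(decode_stream, tokenizer, ids) |> Enum.join()
```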