51 changes: 0 additions & 51 deletions .github/workflows/jekyll-gh-pages.yml

This file was deleted.

38 changes: 27 additions & 11 deletions README.md
@@ -59,7 +59,10 @@ for text in predacons.text_stream(model_path="path/to/your/model",
print(text)

# Get text streamer
thread,streamer = predacons.text_generate(model=model, tokenizer = tokenizer, sequence = seq, max_length=100, temperature=0.1,stream=True)
thread,streamer = predacons.text_generate(model=model, tokenizer=tokenizer, sequence=seq, max_length=100, temperature=0.1, stream=True)

# You can also use a processor instead of a tokenizer for model-based generation:
thread,streamer = predacons.text_generate(model=model, processor=processor, sequence=seq, max_length=100, temperature=0.1, stream=True)

thread.start()
try:
@@ -74,18 +77,31 @@ finally:
chat = [
{"role": "user", "content": "Hey, what is a car?"}
]
chat_output = predacons.chat_generate(model = model,
sequence = chat,
max_length = 50,
tokenizer = tokenizers,
trust_remote_code = True)
chat_output = predacons.chat_generate(model=model,
sequence=chat,
max_length=50,
tokenizer=tokenizers,
trust_remote_code=True)
# You can also use a processor instead of a tokenizer for chat generation:
chat_output = predacons.chat_generate(model=model,
sequence=chat,
max_length=50,
processor=processor,
trust_remote_code=True)

# Stream chat generation using a trained model
for chat in predacons.chat_stream(model = model,
sequence = chat,
max_length = 50,
tokenizer = tokenizers,
trust_remote_code = True):
for chat in predacons.chat_stream(model=model,
sequence=chat,
max_length=50,
tokenizer=tokenizers,
trust_remote_code=True):
print(chat)
# You can also use a processor instead of a tokenizer for chat streaming:
for chat in predacons.chat_stream(model=model,
sequence=chat,
max_length=50,
processor=processor,
trust_remote_code=True):
print(chat)

# get chat streamer
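The README hunks above set up the processor-backed streamer, but the consumption loop itself is collapsed in this diff view. As an editorial sketch only (not text from the README), draining that streamer could look like the following, assuming model, processor, and seq are defined earlier in the README:

thread, streamer = predacons.text_generate(model=model, processor=processor,
                                            sequence=seq, max_length=100,
                                            temperature=0.1, stream=True)
thread.start()
try:
    # TextIteratorStreamer yields decoded text chunks as they are produced
    for text in streamer:
        print(text, end="", flush=True)
finally:
    thread.join()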
1 change: 1 addition & 0 deletions app/predacons/__init__.py
@@ -15,6 +15,7 @@
generate_text_data_source_llm,
load_model,
load_tokenizer,
load_processor,
generate,
text_generate,
chat_generate,
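Because load_processor is now re-exported at the package root above, the new loader should be importable directly from predacons; a minimal sketch, with an illustrative model path and the defaults of the load_processor wrapper added in generate.py below:

from predacons import load_processor

# use_fast and gguf_file mirror the wrapper's defaults in generate.py
processor = load_processor("path/to/your/model", use_fast=False, gguf_file=None)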
47 changes: 46 additions & 1 deletion app/predacons/src/generate.py
@@ -1,4 +1,4 @@
from transformers import AutoModelForPreTraining, AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,TextIteratorStreamer,GenerationConfig
from transformers import AutoModelForPreTraining, AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,TextIteratorStreamer,GenerationConfig,AutoProcessor
import torch
from threading import Thread

@@ -48,6 +48,9 @@ def __load_tokenizer(tokenizer_path,gguf_file=None):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,gguf_file=gguf_file)
return tokenizer

def __load_processor(tokenizer_path,use_fast=False,gguf_file=None):
processor = AutoProcessor.from_pretrained(tokenizer_path, use_fast=use_fast, gguf_file=gguf_file)
return processor
def __generate_output(model_path, sequence, max_length,trust_remote_code=False,gguf_file=None,auto_quantize=None):
model = Generate.__load_model(model_path,trust_remote_code=trust_remote_code,gguf_file=gguf_file,auto_quantize=auto_quantize)
tokenizer = Generate.__load_tokenizer(model_path,gguf_file=gguf_file)
@@ -190,7 +193,41 @@ def __generate_chat_output_from_model_stream(model, tokenizer, sequence, max_len
return thread, streamer
except Exception as e:
raise RuntimeError(f"Failed to setup streaming generation: {str(e)}")

def __generate_output_with_processor(model, processor, messages, max_length, temperature=0.1):
try:
if processor.chat_template is None:
print("Warning: Chat template not found in processor. Applying default chat template")
processor.chat_template = Generate.default_chat_template
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True,
return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.bfloat16)
with torch.inference_mode():
final_outputs = model.generate(**inputs, max_new_tokens=max_length, do_sample=False, temperature=temperature)
return inputs, final_outputs, processor
except Exception as e:
raise RuntimeError(f"Failed to generate output with processor: {str(e)}")

def __generate_output_with_processor_stream(model, processor, messages, max_length, temperature=0.1):
try:
if processor.chat_template is None:
print("Warning: Chat template not found in processor. Applying default chat template")
processor.chat_template = Generate.default_chat_template
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True,
return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.bfloat16)
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_config = GenerationConfig(
temperature=temperature,
do_sample=True,
)
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_length, generation_config=generation_config)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
return thread, streamer
except Exception as e:
raise RuntimeError(f"Failed to generate output with processor stream: {str(e)}")
def generate_output(model_path, sequence, max_length,trust_remote_code=False,gguf_file=None,auto_quantize=None):
return Generate.__generate_output(model_path, sequence, max_length,trust_remote_code=trust_remote_code,gguf_file=gguf_file,auto_quantize=auto_quantize)

@@ -203,6 +240,9 @@ def generate_text(model_path, sequence, max_length,trust_remote_code=False,gguf_
def load_tokenizer(tokenizer_path,gguf_file=None):
return Generate.__load_tokenizer(tokenizer_path,gguf_file=gguf_file)

def load_processor(tokenizer_path,use_fast=False,gguf_file=None):
return Generate.__load_processor(tokenizer_path,use_fast=use_fast,gguf_file=gguf_file)

def load_model(model_path,trust_remote_code=False,gguf_file = None,auto_quantize=None):
return Generate.__load_model(model_path,trust_remote_code=trust_remote_code,gguf_file=gguf_file,auto_quantize=auto_quantize)

@@ -224,3 +264,8 @@ def generate_chat_output_from_model(model, tokenizer, sequence, max_length,tempe
def generate_chat_output_from_model_stream(model, tokenizer, sequence, max_length,temperature=0.1,trust_remote_code=False):
return Generate.__generate_chat_output_from_model_stream(model, tokenizer, sequence, max_length,temperature=temperature,trust_remote_code=trust_remote_code)

def generate_output_with_processor(model, processor, messages, max_length, temperature=0.1):
return Generate.__generate_output_with_processor(model, processor, messages, max_length, temperature)

def generate_output_with_processor_stream(model, processor, messages, max_length, temperature=0.1):
return Generate.__generate_output_with_processor_stream(model, processor, messages, max_length, temperature)
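A hedged sketch of consuming the two new public wrappers above. The import path, the assumption that they are exposed through the Generate class referenced in the diff, the prompt-stripping slice, and processor.decode (typically delegated by AutoProcessor to its tokenizer) are all assumptions rather than part of this PR; model and processor are assumed to be loaded via load_model and the new load_processor:

from predacons.src.generate import Generate  # assumed module layout

messages = [{"role": "user", "content": "Hey, what is a car?"}]

# Non-streaming: returns the tokenized inputs, the raw output ids, and the processor
inputs, outputs, processor = Generate.generate_output_with_processor(
    model, processor, messages, max_length=100)
prompt_len = inputs["input_ids"].shape[-1]
print(processor.decode(outputs[0][prompt_len:], skip_special_tokens=True))

# Streaming: returns an unstarted Thread plus a TextIteratorStreamer to iterate
thread, streamer = Generate.generate_output_with_processor_stream(
    model, processor, messages, max_length=100)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()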