diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml
deleted file mode 100644
index 559bddf..0000000
--- a/.github/workflows/jekyll-gh-pages.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-# Sample workflow for building and deploying a Jekyll site to GitHub Pages
-name: Deploy Jekyll with GitHub Pages dependencies preinstalled
-
-on:
-  # Runs on pushes targeting the default branch
-  push:
-    branches: ["main"]
-
-  # Allows you to run this workflow manually from the Actions tab
-  workflow_dispatch:
-
-# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
-permissions:
-  contents: read
-  pages: write
-  id-token: write
-
-# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
-# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
-concurrency:
-  group: "pages"
-  cancel-in-progress: false
-
-jobs:
-  # Build job
-  build:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-      - name: Setup Pages
-        uses: actions/configure-pages@v3
-      - name: Build with Jekyll
-        uses: actions/jekyll-build-pages@v1
-        with:
-          source: ./
-          destination: ./_site
-      - name: Upload artifact
-        uses: actions/upload-pages-artifact@v2
-
-  # Deployment job
-  deploy:
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
-    runs-on: ubuntu-latest
-    needs: build
-    steps:
-      - name: Deploy to GitHub Pages
-        id: deployment
-        uses: actions/deploy-pages@v2
diff --git a/README.md b/README.md
index 706cf63..173a724 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,10 @@ for text in predacons.text_stream(model_path="path/to/your/model",
     print(text)
 
 # Get text streamer
-thread,streamer = predacons.text_generate(model=model, tokenizer = tokenizer, sequence = seq, max_length=100, temperature=0.1,stream=True)
+thread,streamer = predacons.text_generate(model=model, tokenizer=tokenizer, sequence=seq, max_length=100, temperature=0.1, stream=True)
+
+# You can also use a processor instead of a tokenizer for model-based generation:
+thread,streamer = predacons.text_generate(model=model, processor=processor, sequence=seq, max_length=100, temperature=0.1, stream=True)
 
 thread.start()
 try:
@@ -74,18 +77,31 @@ finally:
 chat = [
     {"role": "user", "content": "Hey, what is a car?"}
 ]
-chat_output = predacons.chat_generate(model = model,
-    sequence = chat,
-    max_length = 50,
-    tokenizer = tokenizers,
-    trust_remote_code = True)
+chat_output = predacons.chat_generate(model=model,
+    sequence=chat,
+    max_length=50,
+    tokenizer=tokenizers,
+    trust_remote_code=True)
+# You can also use a processor instead of a tokenizer for chat generation:
+chat_output = predacons.chat_generate(model=model,
+    sequence=chat,
+    max_length=50,
+    processor=processor,
+    trust_remote_code=True)
 
 # Stream chat generation using a trained model
-for chat in predacons.chat_stream(model = model,
-    sequence = chat,
-    max_length = 50,
-    tokenizer = tokenizers,
-    trust_remote_code = True):
+for chat in predacons.chat_stream(model=model,
+    sequence=chat,
+    max_length=50,
+    tokenizer=tokenizers,
+    trust_remote_code=True):
+    print(chat)
+# You can also use a processor instead of a tokenizer for chat streaming:
+for chat in predacons.chat_stream(model=model,
+    sequence=chat,
+    max_length=50,
+    processor=processor,
+    trust_remote_code=True):
     print(chat)
 
 # get chat streamer
diff --git a/app/predacons/__init__.py b/app/predacons/__init__.py
index 1db39f9..5b32d1e 100644
--- a/app/predacons/__init__.py
+++ b/app/predacons/__init__.py
@@ -15,6 +15,7 @@
     generate_text_data_source_llm,
     load_model,
     load_tokenizer,
+    load_processor,
     generate,
     text_generate,
     chat_generate,
diff --git a/app/predacons/src/generate.py b/app/predacons/src/generate.py
index 4b916fb..d5ddf27 100644
--- a/app/predacons/src/generate.py
+++ b/app/predacons/src/generate.py
@@ -1,4 +1,4 @@
-from transformers import AutoModelForPreTraining, AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,TextIteratorStreamer,GenerationConfig
+from transformers import AutoModelForPreTraining, AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,TextIteratorStreamer,GenerationConfig,AutoProcessor
 import torch
 from threading import Thread
 
@@ -48,6 +48,9 @@ def __load_tokenizer(tokenizer_path,gguf_file=None):
         tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,gguf_file=gguf_file)
         return tokenizer
 
+    def __load_processor(tokenizer_path,use_fast=False,gguf_file=None):
+        processor = AutoProcessor.from_pretrained(tokenizer_path, use_fast=use_fast, gguf_file=gguf_file)
+        return processor
     def __generate_output(model_path, sequence, max_length,trust_remote_code=False,gguf_file=None,auto_quantize=None):
         model = Generate.__load_model(model_path,trust_remote_code=trust_remote_code,gguf_file=gguf_file,auto_quantize=auto_quantize)
         tokenizer = Generate.__load_tokenizer(model_path,gguf_file=gguf_file)
@@ -190,7 +193,41 @@ def __generate_chat_output_from_model_stream(model, tokenizer, sequence, max_len
             return thread, streamer
         except Exception as e:
             raise RuntimeError(f"Failed to setup streaming generation: {str(e)}")
+
+    def __generate_output_with_processor(model, processor, messages, max_length, temperature=0.1):
+        try:
+            if processor.chat_template is None:
+                print("Warning: Chat template not found in processor. Applying default chat template")
+                processor.chat_template = Generate.default_chat_template
+            inputs = processor.apply_chat_template(
+                messages, add_generation_prompt=True, tokenize=True,
+                return_dict=True, return_tensors="pt"
+            ).to(model.device, dtype=torch.bfloat16)
+            with torch.inference_mode():
+                final_outputs = model.generate(**inputs, max_new_tokens=max_length, do_sample=False, temperature=temperature)
+            return inputs, final_outputs, processor
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate output with processor: {str(e)}")
+    def __generate_output_with_processor_stream(model, processor, messages, max_length, temperature=0.1):
+        try:
+            if processor.chat_template is None:
+                print("Warning: Chat template not found in processor. Applying default chat template")
+                processor.chat_template = Generate.default_chat_template
+            inputs = processor.apply_chat_template(
+                messages, add_generation_prompt=True, tokenize=True,
+                return_dict=True, return_tensors="pt"
+            ).to(model.device, dtype=torch.bfloat16)
+            streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+            generation_config = GenerationConfig(
+                temperature=temperature,
+                do_sample=True,
+            )
+            generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_length, generation_config=generation_config)
+            thread = Thread(target=model.generate, kwargs=generation_kwargs)
+            return thread, streamer
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate output with processor stream: {str(e)}")
 
     def generate_output(model_path, sequence, max_length,trust_remote_code=False,gguf_file=None,auto_quantize=None):
         return Generate.__generate_output(model_path, sequence, max_length,trust_remote_code=trust_remote_code,gguf_file=gguf_file,auto_quantize=auto_quantize)
@@ -203,6 +240,9 @@ def generate_text(model_path, sequence, max_length,trust_remote_code=False,gguf_
     def load_tokenizer(tokenizer_path,gguf_file=None):
         return Generate.__load_tokenizer(tokenizer_path,gguf_file=gguf_file)
 
+    def load_processor(tokenizer_path,use_fast=False,gguf_file=None):
+        return Generate.__load_processor(tokenizer_path,use_fast=use_fast,gguf_file=gguf_file)
+
     def load_model(model_path,trust_remote_code=False,gguf_file = None,auto_quantize=None):
         return Generate.__load_model(model_path,trust_remote_code=trust_remote_code,gguf_file=gguf_file,auto_quantize=auto_quantize)
 
@@ -224,3 +264,8 @@ def generate_chat_output_from_model(model, tokenizer, sequence, max_length,tempe
 
     def generate_chat_output_from_model_stream(model, tokenizer, sequence, max_length,temperature=0.1,trust_remote_code=False):
         return Generate.__generate_chat_output_from_model_stream(model, tokenizer, sequence, max_length,temperature=temperature,trust_remote_code=trust_remote_code)
+
+    def generate_output_with_processor(model, processor, messages, max_length, temperature=0.1):
+        return Generate.__generate_output_with_processor(model, processor, messages, max_length, temperature)
+
+    def generate_output_with_processor_stream(model, processor, messages, max_length, temperature=0.1):
+        return Generate.__generate_output_with_processor_stream(model, processor, messages, max_length, temperature)
\ No newline at end of file
diff --git a/app/predacons/src/predacons.py b/app/predacons/src/predacons.py
index fcd2a98..9f09114 100644
--- a/app/predacons/src/predacons.py
+++ b/app/predacons/src/predacons.py
@@ -77,6 +77,7 @@ def rollout():
     print(" draft_model_name -- Draft model name / path (default None)")
     print(" model -- give a preloaded Model (default None)")
     print(" tokenizer -- give a preloaded Tokenizer (default None)")
+    print(" processor -- give a preloaded Processor (default None), alternative to tokenizer for model-based generation")
     print("\ntext_generate -- Generate text and print")
     print(" model_path -- Model path")
     print(" sequence -- Sequence")
@@ -86,6 +87,7 @@
     print(" draft_model_name -- Draft model name / path (default None)")
     print(" model -- give a preloaded Model (default None)")
     print(" tokenizer -- give a preloaded Tokenizer (default None)")
+    print(" processor -- give a preloaded Processor (default None), alternative to tokenizer for model-based generation")
     print("\ntext_stream -- stream text and print")
     print(" model_path -- Model path")
     print(" sequence -- Sequence")
@@ -391,19 +393,20 @@ def generate(*args, **kwargs):
         draft_model_name (str, optional): The name of the draft model. Defaults to None.
         model (object): The model object.
         tokenizer (object): The tokenizer object.
+        processor (object): The processor object. Alternative to tokenizer for model-based generation. If provided, will be used for generation instead of tokenizer.
         apply_chat_template (bool, optional): Whether to apply the chat template. Defaults to False.
         temperature (float, optional): The temperature parameter for controlling the randomness of the generated output. Defaults to 0.1.
         gguf_file (str, optional): The path to the GGUF file. Defaults to None.
         auto_quantize (str, optional): Automatically apply quantization. Accepts "4bit"/"high" for high compression or "8bit"/"low" for lower compression. Defaults to None.
-        stream (bool, optional): Whether to stream the output. Defaults to False. if True, thread and streamer will be returned.
+        stream (bool, optional): Whether to stream the output. Defaults to False. If True, thread and streamer will be returned.
 
     Returns:
-        str: The generated output.
+        str or tuple: The generated output, or (thread, streamer) if streaming is enabled.
 
     Raises:
         ValueError: If the arguments are invalid.
     """
-    if 'model_path' in kwargs and ('sequence' or 'chat') in kwargs:
+    if 'model_path' in kwargs and ('sequence' in kwargs or 'chat' in kwargs):
         model_path = kwargs['model_path']
         sequence = kwargs['sequence']
         max_length = kwargs.get('max_length', 50)
@@ -435,33 +438,51 @@ def generate(*args, **kwargs):
             return Generate.generate_output_stream(model_path, sequence, max_length,trust_remote_code = trust_remote_code,gguf_file = gguf_file,auto_quantize=auto_quantize)
         return Generate.generate_output(model_path, sequence, max_length,trust_remote_code = trust_remote_code,gguf_file = gguf_file,auto_quantize=auto_quantize)
-    elif 'model' in kwargs and 'tokenizer' in kwargs and 'sequence' in kwargs:
+    elif 'model' in kwargs and 'sequence' in kwargs:
         model = kwargs['model']
-        tokenizer = kwargs['tokenizer']
         sequence = kwargs['sequence']
         max_length = kwargs.get('max_length', 50)
         trust_remote_code = kwargs.get('trust_remote_code', False)
         apply_chat_template = kwargs.get('apply_chat_template',False)
         temperature= kwargs.get('temperature',0.1)
         stream = kwargs.get('stream',False)
-        if apply_chat_template == True:
-            if stream:
-                return Generate.generate_chat_output_from_model_stream(model, tokenizer, sequence, max_length,temperature = temperature,trust_remote_code=trust_remote_code)
-            return Generate.generate_chat_output_from_model(model, tokenizer, sequence, max_length,temperature = temperature,trust_remote_code=trust_remote_code)
-        try:
-            if type(model) == torch._dynamo.eval_frame.OptimizedModule:
-                print("generate_output using fast generation")
-                return GPTFast.generate_output_from_model(model, tokenizer, sequence, max_length)
-            else:
+        processor = kwargs.get('processor', None)
+        tokenizer = kwargs.get('tokenizer', None)
+        if processor is not None:
+            if apply_chat_template:
                 if stream:
-                    return Generate.generate_output_from_model_stream(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
-                return Generate.generate_output_from_model(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
-        except Exception as e:
-            print("Exception occurred while loading torch._dynamo.eval_frame.OptimizedModule")
-            print("generate_output using default generation")
-            if stream:
-                return Generate.generate_output_from_model_stream(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
-            return Generate.generate_output_from_model(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
+                return Generate.generate_output_with_processor_stream(model, processor, sequence, max_length, temperature=temperature)
+            return Generate.generate_output_with_processor(model, processor, sequence, max_length, temperature=temperature)
+            else:
+                # If not chat, fallback to tokenizer if available
+                if tokenizer is not None:
+                    if stream:
+                        return Generate.generate_output_from_model_stream(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
+                    return Generate.generate_output_from_model(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
+                else:
+                    raise ValueError("Processor-based generation requires apply_chat_template=True or a tokenizer.")
+        else:
+            if tokenizer is not None:
+                if apply_chat_template:
+                    if stream:
+                        return Generate.generate_chat_output_from_model_stream(model, tokenizer, sequence, max_length,temperature = temperature,trust_remote_code=trust_remote_code)
+                    return Generate.generate_chat_output_from_model(model, tokenizer, sequence, max_length,temperature = temperature,trust_remote_code=trust_remote_code)
+                try:
+                    if type(model) == torch._dynamo.eval_frame.OptimizedModule:
+                        print("generate_output using fast generation")
+                        return GPTFast.generate_output_from_model(model, tokenizer, sequence, max_length)
+                    else:
+                        if stream:
+                            return Generate.generate_output_from_model_stream(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
+                        return Generate.generate_output_from_model(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
+                except Exception as e:
+                    print("Exception occurred while loading torch._dynamo.eval_frame.OptimizedModule")
+                    print("generate_output using default generation")
+                    if stream:
+                        return Generate.generate_output_from_model_stream(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
+                    return Generate.generate_output_from_model(model, tokenizer, sequence, max_length,trust_remote_code=trust_remote_code)
+            else:
+                raise ValueError("Must provide either a processor or tokenizer with model.")
     else:
         raise ValueError("Invalid arguments")
@@ -482,11 +503,12 @@ def text_generate(*args, **kwargs):
         draft_model_name (str, optional): The name of the draft model. Defaults to None.
         model (object): The model object.
         tokenizer (object): The tokenizer object.
+        processor (object): The processor object. Alternative to tokenizer for model-based generation. If provided, will be used for generation instead of tokenizer.
         apply_chat_template (bool, optional): Whether to apply the chat template. Defaults to False.
         temperature (float, optional): The temperature parameter for controlling the randomness of the generated output. Defaults to 0.1.
         gguf_file (str, optional): The path to the GGUF file. Defaults to None.
         auto_quantize (str, optional): Automatically apply quantization. Accepts "4bit"/"high" for high compression or "8bit"/"low" for lower compression. Defaults to None.
-        stream (bool, optional): Whether to stream the output. Defaults to False. if True, thread and streamer will be returned.
+        stream (bool, optional): Whether to stream the output. Defaults to False. If True, thread and streamer will be returned.
 
     Returns:
         str: The generated text.
            or
@@ -498,9 +520,24 @@ def text_generate(*args, **kwargs):
     if stream:
         thread, streamer = generate(*args, **kwargs)
         return thread, streamer
-    output, tokenizer = generate(*args, **kwargs)
-    print(tokenizer.decode(output[0], skip_special_tokens=True))
-    return tokenizer.decode(output[0], skip_special_tokens=True)
+    result = generate(*args, **kwargs)
+    # result can be (output, tokenizer) or (inputs, output, tokenizer) or (inputs, output, processor)
+    if isinstance(result, tuple):
+        if len(result) == 2:
+            output, tok = result
+            if hasattr(tok, 'decode'):
+                print(tok.decode(output[0], skip_special_tokens=True))
+                return tok.decode(output[0], skip_special_tokens=True)
+            else:
+                return output
+        elif len(result) == 3:
+            inputs, output, tok = result
+            if hasattr(tok, 'decode'):
+                print(tok.decode(output[0][inputs['input_ids'].size(1):], skip_special_tokens=True))
+                return tok.decode(output[0][inputs['input_ids'].size(1):], skip_special_tokens=True)
+            else:
+                return output
+    return result
 
 def _handle_stream(thread, streamer):
     """Internal utility to handle streaming output."""
     thread.start()
@@ -529,6 +566,7 @@ def text_stream(*args, **kwargs):
         draft_model_name (str, optional): The name of the draft model. Defaults to None.
         model (object): The model object.
         tokenizer (object): The tokenizer object.
+        processor (object): The processor object. Alternative to tokenizer for model-based generation. If provided, will be used for generation instead of tokenizer.
         apply_chat_template (bool, optional): Whether to apply the chat template. Defaults to False.
         temperature (float, optional): The temperature parameter for controlling the randomness of the generated output. Defaults to 0.1.
         gguf_file (str, optional): The path to the GGUF file. Defaults to None.
@@ -568,11 +606,11 @@ def chat_generate(*args, **kwargs):
         draft_model_name (str, optional): The name of the draft model. Defaults to None.
         model (object): The model object.
         tokenizer (object): The tokenizer object.
+        processor (object): The processor object. Alternative to tokenizer for model-based generation. If provided, will be used for generation instead of tokenizer.
         apply_chat_template (bool, optional): Whether to apply the chat template. Defaults to False.
         temperature (float, optional): The temperature parameter for controlling the randomness of the generated output. Defaults to 0.1.
         gguf_file (str, optional): The path to the GGUF file. Defaults to None.
         auto_quantize (str, optional): Automatically apply quantization. Accepts "4bit"/"high" for high compression or "8bit"/"low" for lower compression. Defaults to None.
-
     Returns:
         str: The generated chat .
            or
@@ -586,10 +624,16 @@
     thread, streamer = generate(*args, **kwargs)
         return thread, streamer
     dont_print_output = kwargs.get('dont_print_output', False)
-    input,output, tokenizer = generate(*args, **kwargs)
-    if not dont_print_output:
-        print(tokenizer.decode(output[0][input['input_ids'].size(1):], skip_special_tokens=True))
-    return tokenizer.decode(output[0][input['input_ids'].size(1):], skip_special_tokens=True)
+    result = generate(*args, **kwargs)
+    if isinstance(result, tuple) and len(result) == 3:
+        inputs, output, tok = result
+        if hasattr(tok, 'decode'):
+            if not dont_print_output:
+                print(tok.decode(output[0][inputs['input_ids'].size(1):], skip_special_tokens=True))
+            return tok.decode(output[0][inputs['input_ids'].size(1):], skip_special_tokens=True)
+        else:
+            return output
+    return result
 
 def chat_stream(*args, **kwargs):
     """ stream text using the specified arguments.
@@ -607,6 +651,7 @@
         draft_model_name (str, optional): The name of the draft model. Defaults to None.
         model (object): The model object.
         tokenizer (object): The tokenizer object.
+        processor (object): The processor object. Alternative to tokenizer for model-based generation. If provided, will be used for generation instead of tokenizer.
         apply_chat_template (bool, optional): Whether to apply the chat template. Defaults to False.
         temperature (float, optional): The temperature parameter for controlling the randomness of the generated output. Defaults to 0.1.
         gguf_file (str, optional): The path to the GGUF file. Defaults to None.
@@ -617,7 +662,6 @@
            or
         thread: The thread object.
         streamer: The streamer object.
-
     """
     kwargs['stream'] = True
     kwargs['apply_chat_template'] = True
@@ -701,4 +745,18 @@ def load_tokenizer(tokenizer_path,gguf_file=None):
     """
     return Generate.load_tokenizer(tokenizer_path,gguf_file=gguf_file)
 
+def load_processor(processor_path,use_fast=False,gguf_file=None):
+    """
+    Loads a processor from the specified path.
+
+    Args:
+        processor_path (str): The path to the processor file.
+        use_fast (bool, optional): Whether to use fast processing. Defaults to False.
+        gguf_file (str, optional): The path to the GGUF file. Defaults to None.
+
+    Returns:
+        processor: The loaded processor object.
+    """
+    return Generate.load_processor(processor_path,use_fast=use_fast,gguf_file=gguf_file)
+
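
Usage sketch for the processor path introduced above. The model path and chat content are placeholders, and a checkpoint that ships an AutoProcessor is assumed. chat_generate and chat_stream go through the chat-template branch of generate(), so passing processor= routes to the new generate_output_with_processor helpers; calling text_generate with a processor but without apply_chat_template=True falls back to a tokenizer and raises if none is supplied.

# Sketch only: model path and prompt are placeholders, not part of the diff.
import predacons

model = predacons.load_model("path/to/your/model", trust_remote_code=True)
processor = predacons.load_processor("path/to/your/model")

chat = [
    {"role": "user", "content": "Hey, what is a car?"}
]

# Non-streaming: generate() builds inputs with processor.apply_chat_template,
# calls model.generate, and chat_generate decodes only the newly generated tokens.
reply = predacons.chat_generate(model=model,
    sequence=chat,
    max_length=50,
    processor=processor)

# Streaming: chat_stream yields decoded chunks from the TextIteratorStreamer
# created in generate_output_with_processor_stream.
for chunk in predacons.chat_stream(model=model,
    sequence=chat,
    max_length=50,
    processor=processor):
    print(chunk)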
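
If the raw streamer is needed rather than an iterator of chunks, stream=True returns the (thread, streamer) pair built by generate_output_with_processor_stream. The consumption pattern below mirrors the README's text-streamer example; the loop and join details are a suggestion, not something defined by this diff.

# Manual handling of the (thread, streamer) pair from the processor stream path.
thread, streamer = predacons.chat_generate(model=model,
    sequence=chat,
    max_length=50,
    processor=processor,
    stream=True)

thread.start()
try:
    for token in streamer:
        print(token, end="", flush=True)
finally:
    thread.join()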