From 4d81db938b248d6ba2fb5c8cb5d6b749d35fae13 Mon Sep 17 00:00:00 2001 From: Delsy-Kinyuy Date: Wed, 24 Sep 2025 03:24:00 +0100 Subject: [PATCH 1/6] Completed Exercise 1 - Hello LLM Endpoint --- done.txt | 6 ++++++ exercise1.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 done.txt create mode 100644 exercise1.py diff --git a/done.txt b/done.txt new file mode 100644 index 0000000..2a45f56 --- /dev/null +++ b/done.txt @@ -0,0 +1,6 @@ +NameName: Delsy Kinyuy +Exercise: 1 - Hello LLM Endpoint +Challenges: +- Installing transformers took some time +- Model download was large on first run + Needed to adjust max_length to avoid errors diff --git a/exercise1.py b/exercise1.py new file mode 100644 index 0000000..eddbf8d --- /dev/null +++ b/exercise1.py @@ -0,0 +1,30 @@ +from fastapi import FastAPI +from pydantic import BaseModel +from transformers import pipeline + +# init FastAPI +app = FastAPI(title="Generative AI Exercises") + +# load Hugging Face model (distilgpt2 is small enough to run on CPU) +generator = pipeline("text-generation", model="distilgpt2") + +# request schema +class PromptRequest(BaseModel): + prompt: str + max_tokens: int = 100 + +@app.get("/") +def root(): + return {"message": "FastAPI + Hugging Face is live πŸš€"} + +@app.post("/hello-llm") +def hello_llm(request: PromptRequest): + """ + Generate text from a given prompt using distilgpt2. + """ + output = generator( + request.prompt, + max_length=len(request.prompt.split()) + request.max_tokens, + num_return_sequences=1 + ) + return {"prompt": request.prompt, "generated": output[0]["generated_text"]} From c968d23afcdcf30f5c3eb25042936073791a3ecd Mon Sep 17 00:00:00 2001 From: Delsy-Kinyuy Date: Thu, 25 Sep 2025 03:43:35 +0100 Subject: [PATCH 2/6] Completed Exercise 1 - Hello LLM Endpoint --- exercise1.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/exercise1.py b/exercise1.py index eddbf8d..0a7c644 100644 --- a/exercise1.py +++ b/exercise1.py @@ -28,3 +28,21 @@ def hello_llm(request: PromptRequest): num_return_sequences=1 ) return {"prompt": request.prompt, "generated": output[0]["generated_text"]} +# answer interview questions +# **Interview Questions:** + +# 1. What is a language model? +# answer: a language model is a statistical tool that predicts the next word in a sequence based on the words that came before it. It is trained on large datasets of text to learn patterns, grammar, and context, enabling it to generate coherent and contextually relevant text. +# 2. How does GPT-2 differ from GPT-3/4? +# answer: GPT-2 is smaller and less powerful than GPT-3/4, with fewer parameters and less training data. GPT-3/4 can generate more coherent and contextually relevant text, handle more complex tasks, and understand nuanced prompts better than GPT-2. +# 3. Why is `distilgpt2` considered lightweight? +# answer: `distilgpt2` is a distilled version of GPT-2, meaning it has been compressed to reduce its size and computational requirements while retaining much of the original model's performance. This makes it more efficient and faster to run, especially on hardware with limited resources. +# 4. What are tokens, and why do they matter in LLMs? +# answer: Tokens are the basic units of text that a language model processes, which can be words, subwords, or characters. They matter because LLMs have limits on the number of tokens they can handle in a single input or output, affecting the model's ability to understand and generate text effectively. +# 5. 
How do you handle prompt length limits? +# answer: To handle prompt length limits, you can truncate or summarize the input text to fit within the model's maximum token limit. Additionally, you can use techniques like sliding windows for longer texts or break down the input into smaller, manageable chunks. +# 6. Why expose models through an API instead of CLI? +# answer: Exposing models through an API allows for easier integration with various applications, enabling remote access and scalability. It also provides a more user-friendly interface for developers and users who may not be comfortable with command-line interfaces (CLI). +# 7. What’s the risk of directly exposing LLMs without moderation? +# answer: Directly exposing LLMs without moderation can lead to the generation of harmful, biased, or inappropriate content. LLMs may inadvertently produce offensive language, misinformation, or content that violates ethical guidelines, which can harm users and damage the reputation of the service provider. + From 0a34165c49b138215c623aa2a81fb984b0e7747f Mon Sep 17 00:00:00 2001 From: Delsy-Kinyuy Date: Thu, 25 Sep 2025 04:00:05 +0100 Subject: [PATCH 3/6] Completed Exercise 2 - Text Summarizer API --- exercise2.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 exercise2.py diff --git a/exercise2.py b/exercise2.py new file mode 100644 index 0000000..c19e9ca --- /dev/null +++ b/exercise2.py @@ -0,0 +1,60 @@ +from fastapi import FastAPI +from pydantic import BaseModel +from transformers import pipeline + +app = FastAPI(title="Generative AI Exercises") + +# ---- Load models once on startup ---- +generator = pipeline("text-generation", model="distilgpt2") +summarizer = pipeline("summarization", model="facebook/bart-large-cnn") + +# ---- Schemas ---- +class PromptRequest(BaseModel): + prompt: str + max_tokens: int = 50 + +class SummarizeRequest(BaseModel): + text: str + max_tokens: int = 130 + min_tokens: int = 40 + +# ---- Routes ---- +@app.get("/") +def root(): + return {"message": "FastAPI + Hugging Face is live πŸš€"} + +@app.post("/hello-llm") +def hello_llm(request: PromptRequest): + output = generator( + request.prompt, + max_length=len(request.prompt.split()) + request.max_tokens, + num_return_sequences=1 + ) + return {"prompt": request.prompt, "generated": output[0]["generated_text"]} + +@app.post("/summarize") +def summarize(request: SummarizeRequest): + summary = summarizer( + request.text, + max_length=request.max_tokens, + min_length=request.min_tokens, + do_sample=False + ) + return {"summary": summary[0]["summary_text"]} + +# **Interview Questions:** + +# 1. What is abstractive vs extractive summarization? +# answer: Abstractive summarization generates new phrases and sentences to capture the main ideas of the text, while extractive summarization selects and compiles key sentences or phrases directly from the original text. +# 2. Why is BART good for summarization? +# answer: BART is effective for summarization because it combines a bidirectional encoder (like BERT) with a left-to-right decoder (like GPT), allowing it to understand context and generate coherent summaries. Its pre-training on large text corpora helps it learn language patterns, making it adept at producing fluent and relevant summaries. +# 3. What are encoder-decoder architectures? 
+# answer: Encoder-decoder architectures consist of two main components: an encoder that processes the input data and encodes it into a fixed-size representation, and a decoder that takes this representation and generates the output sequence. This architecture is commonly used in tasks like machine translation and text summarization. +# 4. How does beam search affect summary quality? +# answer: Beam search improves summary quality by exploring multiple possible output sequences simultaneously, allowing the model to consider various options and select the most probable one. This leads to more coherent and contextually relevant summaries compared to greedy decoding, which only considers the most likely next word at each step. +# 5. What are hallucinations in summarization? +# answer: Hallucinations in summarization refer to instances where the model generates information that is not present in the original text, leading to inaccuracies or misleading content in the summary. This can occur when the model overgeneralizes or misinterprets the input data. +# 6. What evaluation metrics exist (ROUGE, BLEU)? +# answer: Evaluation metrics for summarization include ROUGE (Recall-Oriented Understudy for Gisting Evaluation), which measures the overlap of n-grams, word sequences, and word pairs between the generated summary and reference summaries. BLEU (Bilingual Evaluation Understudy) is another metric that evaluates the quality of text by comparing it to one or more reference texts, focusing on precision of n-grams. +# 7. How would you fine-tune BART on legal documents? +# answer: To fine-tune BART on legal documents, I would first gather a large dataset of legal texts and their corresponding summaries. Then, I would preprocess the data to ensure it is clean and formatted correctly. Next, I would use transfer learning to fine-tune the pre-trained BART model on this dataset, adjusting hyperparameters such as learning rate and batch size to optimize performance. Finally, I would evaluate the model using relevant metrics like ROUGE to ensure it generates accurate and coherent summaries of legal documents. 
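The beam-search and ROUGE points in the answers above (questions 4 and 6) can be made concrete with a small offline sketch. The snippet below is illustrative only and is not part of the patch series: it assumes the same facebook/bart-large-cnn summarizer used in exercise2.py plus the separately installed rouge_score package, and the article and reference strings are invented for the example.

# rough sketch: beam-search decoding plus ROUGE scoring for the summarizer
# assumes `pip install rouge_score` in addition to transformers
from transformers import pipeline
from rouge_score import rouge_scorer

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

article = (
    "FastAPI is a modern web framework for building APIs with Python. "
    "It is based on standard Python type hints, generates interactive API docs "
    "automatically, and is widely used to serve machine learning models."
)
reference = "FastAPI is a Python web framework often used to serve machine learning models."

# num_beams > 1 enables beam search; do_sample=False keeps decoding deterministic
summary = summarizer(
    article, max_length=40, min_length=10, num_beams=4, do_sample=False
)[0]["summary_text"]

# ROUGE-1 / ROUGE-L measure n-gram and longest-common-subsequence overlap
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
print(summary)
print(scorer.score(reference, summary))

Raising num_beams generally trades decoding speed for summaries that score somewhat better on ROUGE, which matches the answer to question 4 above.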
From 8124618ce725cbad9d4e67e6fbea1269a3de2f39 Mon Sep 17 00:00:00 2001 From: Delsy-Kinyuy Date: Thu, 25 Sep 2025 04:11:46 +0100 Subject: [PATCH 4/6] Completed Exercise 3 - Sentiment Analysis API --- done.txt | 13 +++++----- exercise3.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 exercise3.py diff --git a/done.txt b/done.txt index 2a45f56..e2505fc 100644 --- a/done.txt +++ b/done.txt @@ -1,6 +1,7 @@ -NameName: Delsy Kinyuy -Exercise: 1 - Hello LLM Endpoint -Challenges: -- Installing transformers took some time -- Model download was large on first run - Needed to adjust max_length to avoid errors +Name: Delsy Kinyuy +Exercise: 3 - Sentiment Analysis API +Challenges: +- Model loads fast, but first run needed internet to fetch weights +- Sentiment labels limited to POSITIVE/NEGATIVE (no neutral) +- Sarcasm detection is tricky, outputs can be misleading + diff --git a/exercise3.py b/exercise3.py new file mode 100644 index 0000000..aed8ad5 --- /dev/null +++ b/exercise3.py @@ -0,0 +1,71 @@ +# main.py (extend from Exercises 1 & 2) + +from fastapi import FastAPI +from pydantic import BaseModel +from transformers import pipeline + +app = FastAPI(title="Generative AI Exercises") + +# ---- Load models ---- +generator = pipeline("text-generation", model="distilgpt2") +summarizer = pipeline("summarization", model="facebook/bart-large-cnn") +sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english") + +# ---- Schemas ---- +class PromptRequest(BaseModel): + prompt: str + max_tokens: int = 50 + +class SummarizeRequest(BaseModel): + text: str + max_tokens: int = 130 + min_tokens: int = 30 + +class SentimentRequest(BaseModel): + text: str + +# ---- Routes ---- +@app.get("/") +def root(): + return {"message": "FastAPI + Hugging Face is live πŸš€"} + +@app.post("/hello-llm") +def hello_llm(request: PromptRequest): + output = generator( + request.prompt, + max_length=len(request.prompt.split()) + request.max_tokens, + num_return_sequences=1 + ) + return {"prompt": request.prompt, "generated": output[0]["generated_text"]} + +@app.post("/summarize") +def summarize(request: SummarizeRequest): + summary = summarizer( + request.text, + max_length=request.max_tokens, + min_length=request.min_tokens, + do_sample=False + ) + return {"summary": summary[0]["summary_text"]} + +@app.post("/sentiment") +def sentiment(request: SentimentRequest): + result = sentiment_analyzer(request.text) + return {"text": request.text, "sentiment": result[0]} + +# **Interview Questions:** + +# 1. What is transfer learning in NLP? +# answer: Transfer learning in NLP involves taking a pre-trained model (trained on a large corpus of text) and fine-tuning it on a specific task or dataset. This allows the model to leverage learned language representations, reducing the need for large amounts of task-specific data and improving performance. +# 2. Why use DistilBERT instead of BERT? +# answer: DistilBERT is a smaller, faster, and more efficient version of BERT that retains about 97% of BERT's performance while being 60% faster and having 40% fewer parameters. This makes it more suitable for deployment in resource-constrained environments or applications requiring lower latency. +# 3. What dataset is SST-2? +# answer: SST-2 (Stanford Sentiment Treebank) is a dataset used for sentiment analysis that contains movie reviews labeled as positive or negative. 
It is widely used for training and evaluating sentiment classification models.
+# 4. What are embeddings in classification?
+# answer: Embeddings are dense vector representations of words or phrases that capture semantic meaning and relationships. In classification tasks, embeddings serve as input features for machine learning models, allowing them to understand the context and nuances of the text data.
+# 5. How do you evaluate classification performance?
+# answer: Classification performance can be evaluated using metrics such as accuracy, precision, recall, F1-score, and confusion matrix. The choice of metric depends on the specific task and the importance of false positives vs. false negatives.
+# 6. What biases can exist in sentiment models?
+# answer: Sentiment models can exhibit biases based on the training data, such as demographic, cultural, or domain biases. For example, a model trained mostly on movie reviews may misread sentiment in other domains, and dialects or groups underrepresented in the data may receive systematically skewed predictions.
+# 7. How would you handle sarcasm in sentiment detection?
+# answer: Sarcasm can be challenging for sentiment detection as it often involves saying the opposite of what is meant. To handle sarcasm, one could use more sophisticated models that consider context, tone, and user behavior. Additionally, incorporating datasets specifically labeled for sarcasm can help improve model performance in detecting sarcastic remarks.
\ No newline at end of file

From 5499d82806ed0d0e7e81fd851a0876d8911e8d3f Mon Sep 17 00:00:00 2001
From: Delsy-Kinyuy
Date: Sat, 27 Sep 2025 01:50:18 +0100
Subject: [PATCH 5/6] Completed Exercise 4 - Multimodal Image Captioning

---
 exercise4.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 exercise4.py

diff --git a/exercise4.py b/exercise4.py
new file mode 100644
index 0000000..40ccf53
--- /dev/null
+++ b/exercise4.py
@@ -0,0 +1,82 @@
+# main.py (extend from Exercises 1–3)
+
+from fastapi import FastAPI, UploadFile, File
+from pydantic import BaseModel
+from transformers import pipeline
+from PIL import Image
+import io
+
+app = FastAPI(title="Generative AI Exercises")
+
+# ---- Load models ----
+generator = pipeline("text-generation", model="distilgpt2")
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+
+# ---- Schemas ----
+class PromptRequest(BaseModel):
+    prompt: str
+    max_tokens: int = 50
+
+class SummarizeRequest(BaseModel):
+    text: str
+    max_tokens: int = 130
+    min_tokens: int = 30
+
+class SentimentRequest(BaseModel):
+    text: str
+
+# ---- Routes ----
+@app.get("/")
+def root():
+    return {"message": "FastAPI + Hugging Face is live πŸš€"}
+
+@app.post("/hello-llm")
+def hello_llm(request: PromptRequest):
+    output = generator(
+        request.prompt,
+        max_length=len(request.prompt.split()) + request.max_tokens,
+        num_return_sequences=1
+    )
+    return {"prompt": request.prompt, "generated": output[0]["generated_text"]}
+
+@app.post("/summarize")
+def summarize(request: SummarizeRequest):
+    summary = summarizer(
+        request.text,
+        max_length=request.max_tokens,
+        min_length=request.min_tokens,
+        do_sample=False
+    )
+    return {"summary": summary[0]["summary_text"]}
+
+@app.post("/sentiment")
+def sentiment(request: SentimentRequest):
+    result = sentiment_analyzer(request.text)
+    return {"text": request.text, "sentiment": result[0]}
+
+@app.post("/caption-image")
+async def caption_image(file: UploadFile = File(...)):
+    contents = await file.read()
+    image = Image.open(io.BytesIO(contents)).convert("RGB")
+    caption = captioner(image)
+    return {"filename": file.filename, "caption": caption[0]["generated_text"]}
+
+
+# **Interview Questions:**
+
+# 1. How does ViT process images?
+# answer: ViT splits images into patches, processes them as sequences, and uses self-attention to capture relationships.
+# 2. What role does GPT-2 play in captioning?
+# answer: GPT-2 generates coherent text based on the visual features extracted by the vision encoder.
+# 3. Why combine a vision encoder with a language decoder?
+# answer: Combining them allows the model to understand visual content and generate relevant textual descriptions.
+# 4. What datasets are used for captioning?
+# answer: Common datasets include MS COCO, Flickr8k, and Flickr30k.
+# 5. What challenges exist in image captioning?
+# answer: Challenges include understanding context, handling diverse objects, and generating natural language.
+# 6. How do you evaluate captions (BLEU, CIDEr)?
+# answer: BLEU measures n-gram overlap, while CIDEr evaluates consensus with multiple references.
+# 7. What real-world apps use captioning?
+# answer: Real-world applications include accessibility tools, content management, and social media platforms.

From 5f4a8fbccec2bf3d17d28392cdc585d9ce944d6c Mon Sep 17 00:00:00 2001
From: Delsy-Kinyuy
Date: Sat, 27 Sep 2025 01:54:05 +0100
Subject: [PATCH 6/6] Completed Exercise 5 - Naive RAG with Chroma + LangChain

---
 exercise5.py | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 exercise5.py

diff --git a/exercise5.py b/exercise5.py
new file mode 100644
index 0000000..1c5f127
--- /dev/null
+++ b/exercise5.py
@@ -0,0 +1,121 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import pipeline
+from PIL import Image
+import io
+
+# LangChain + Chroma
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.chains import RetrievalQA
+from langchain.llms import HuggingFacePipeline
+
+app = FastAPI(title="Generative AI Exercises")
+
+# ---- Hugging Face pipelines ----
+generator = pipeline("text-generation", model="distilgpt2")
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+
+# Wrap distilgpt2 for LangChain
+llm = HuggingFacePipeline(pipeline=generator)
+
+# ---- RAG Setup (Chroma) ----
+# Sample documents (in practice you’d load PDFs, Markdown, etc.)
+docs = [
+    "FastAPI is a modern, fast web framework for building APIs with Python.",
+    "LangChain is a framework for developing applications powered by language models.",
+    "Chroma is an open-source embedding database for building retrieval-augmented generation (RAG) systems."
+] + +# Split into chunks +splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20) +documents = splitter.create_documents(docs) + +# Embeddings +embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") + +# Create Chroma vector store +vectorstore = Chroma.from_documents(documents, embeddings, collection_name="exercise5") + +# Build RAG chain +qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever()) + +# ---- Schemas ---- +from fastapi import UploadFile, File + +class PromptRequest(BaseModel): + prompt: str + max_tokens: int = 50 + +class SummarizeRequest(BaseModel): + text: str + max_tokens: int = 130 + min_tokens: int = 30 + +class SentimentRequest(BaseModel): + text: str + +class RAGRequest(BaseModel): + query: str + +# ---- Routes ---- +@app.get("/") +def root(): + return {"message": "FastAPI + Hugging Face + LangChain is live πŸš€"} + +@app.post("/hello-llm") +def hello_llm(request: PromptRequest): + output = generator( + request.prompt, + max_length=len(request.prompt.split()) + request.max_tokens, + num_return_sequences=1 + ) + return {"prompt": request.prompt, "generated": output[0]["generated_text"]} + +@app.post("/summarize") +def summarize(request: SummarizeRequest): + summary = summarizer( + request.text, + max_length=request.max_tokens, + min_length=request.min_tokens, + do_sample=False + ) + return {"summary": summary[0]["summary_text"]} + +@app.post("/sentiment") +def sentiment(request: SentimentRequest): + result = sentiment_analyzer(request.text) + return {"text": request.text, "sentiment": result[0]} + +@app.post("/caption-image") +async def caption_image(file: UploadFile = File(...)): + contents = await file.read() + image = Image.open(io.BytesIO(contents)).convert("RGB") + caption = captioner(image) + return {"filename": file.filename, "caption": caption[0]["generated_text"]} + +@app.post("/rag-query") +def rag_query(request: RAGRequest): + answer = qa.run(request.query) + return {"query": request.query, "answer": answer} + + +# **Interview Questions:** + +# 1. What is RAG and why is it useful? +# answer: RAG (Retrieval-Augmented Generation) is a technique that combines retrieval of relevant documents with generative models to produce more accurate and contextually relevant responses. It is useful because it allows the model to access up-to-date information and reduces the need for extensive fine-tuning on specific datasets. +# 2. How do embeddings represent meaning? +# answer: Embeddings are dense vector representations of text that capture semantic meaning by placing similar concepts closer together in the vector space. They are generated using models trained on large corpora, allowing them to understand relationships between words and phrases based on context. +# 3. Why use Chroma as a vector DB? +# answer: Chroma is an open-source, efficient, and scalable vector database that allows for fast similarity searches and easy integration with various embedding models. It is designed to handle large-scale datasets and provides features like persistence and indexing, making it suitable for RAG applications. +# 4. What is cosine similarity in retrieval? +# answer: Cosine similarity is a metric used to measure the similarity between two non-zero vectors by calculating the cosine of the angle between them. In retrieval, it helps identify how similar a query vector is to document vectors in the embedding space, allowing for effective ranking of relevant documents. +# 5. How do you update a knowledge base? 
+# answer: A knowledge base can be updated by adding new documents, re-embedding the content, and updating the vector store. This may involve re-indexing or incrementally adding new embeddings to ensure the retrieval system reflects the most current information. +# 6. What is the risk of injecting irrelevant documents? +# answer: Injecting irrelevant documents can lead to poor retrieval results, as the model may retrieve and generate responses based on unrelated or incorrect information. This can degrade the quality of answers and reduce user trust in the system. +# 7. How does RAG differ from fine-tuning? +# answer: RAG leverages external knowledge through retrieval, allowing the model to access a broader range of information without needing to modify the model's parameters. Fine-tuning, on the other hand, involves adjusting the model's weights on a specific dataset, which can be time-consuming and may lead to overfitting on that dataset. RAG provides more flexibility and adaptability to new information.
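As a rough illustration of the cosine-similarity and knowledge-base-update answers above, the sketch below reuses the same sentence-transformers/all-MiniLM-L6-v2 embeddings and Chroma wrapper as exercise5.py; the collection name, example texts, and the plain-NumPy cosine helper are assumptions made for the example, not part of the submitted exercises.

# rough sketch: cosine similarity between embeddings, then adding new text to Chroma
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def cosine(a, b):
    # cosine similarity = dot(a, b) / (|a| * |b|); values near 1.0 mean "semantically close"
    a, b = np.array(a), np.array(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

query_vec = embeddings.embed_query("What is FastAPI?")
doc_vec = embeddings.embed_query("FastAPI is a modern, fast web framework for building APIs with Python.")
print(cosine(query_vec, doc_vec))

# updating the knowledge base: index new text, then retrieve against the refreshed store
vectorstore = Chroma.from_texts(
    ["LangChain is a framework for developing applications powered by language models."],
    embeddings,
    collection_name="exercise5-demo",  # hypothetical collection, separate from the API's store
)
vectorstore.add_texts(["Chroma supports adding new documents without rebuilding the whole index."])
hits = vectorstore.similarity_search("How do I add documents to Chroma?", k=1)
print(hits[0].page_content)

In exercise5.py itself, the equivalent update path would be calling add_texts (or add_documents) on the existing vectorstore so that the RetrievalQA chain sees the new content on its next query.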