iws3 · NJEI03 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
diff --git a/1-hello-llm/__pycache__/main.cpython-313.pyc b/1-hello-llm/__pycache__/main.cpython-313.pyc
diff --git a/1-hello-llm/interview_qa.txt b/1-hello-llm/interview_qa.txt
@@ -0,0 +1,14 @@
+1-What is a language model? A program that predicts the next most likely word (token) in a sequence.
+
+2-GPT-2 vs GPT-3/4? GPT-3/4 are much larger, trained on more data, and smarter. distilgpt2 is a simplified GPT-2.
+
+3-Why is distilgpt2 lightweight? It's a "distilled" version—a smaller model trained to imitate the bigger one. Fewer parameters = faster, less memory.
+
+4-What are tokens? Words or pieces of words the model understands. The model thinks in tokens, not full words.
+
+5-Prompt length limits? Models have a max token limit (the context window). We use max_length to cap the total length, prompt plus generated tokens. If the prompt is too long, you must shorten or truncate it.
+
+6-API vs CLI? An API lets other programs (like a website or app) use your model easily. A CLI is just for you on your computer.
+
+7-Risk without moderation? The model might generate bad, biased, or private information from its training data. You need filters.
+
diff --git a/1-hello-llm/main.py b/1-hello-llm/main.py
@@ -0,0 +1,30 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import pipeline  #Easiest way to use
+
+# Create the FastAPI application that the route below is registered on
+app = FastAPI()
+
+# Build the distilgpt2 text-generation pipeline once, at import time,
+# so every request reuses the same `generator` object
+generator = pipeline('text-generation', model='distilgpt2')
+
+# Request body schema for the endpoint: a single string field, `prompt`
+
+class Request(BaseModel):
+    prompt: str  # text handed to the generator as the prompt
+
+# Creating the endpoint
+@app.post('/hello-llm')
+def generate_text(request: Request):
+    # Sampling settings forwarded to the pipeline call
+    sampling_options = dict(
+        max_length=100,
+        num_return_sequences=1,
+        temperature=0.7,
+        repetition_penalty=1.5,
+        do_sample=True,
+    )
+    result = generator(request.prompt, **sampling_options)
+    # Echo the raw pipeline output to stdout
+    print(result)
+    # Only one sequence is requested, so the first entry is the completion
+    first_sequence = result[0]
+    return {"generated_text": first_sequence['generated_text']}
+
+
+
diff --git a/2-Text Summarizer API/__pycache__/main.cpython-313.pyc b/2-Text Summarizer API/__pycache__/main.cpython-313.pyc
diff --git a/2-Text Summarizer API/interview_qa.txt b/2-Text Summarizer API/interview_qa.txt
@@ -0,0 +1,13 @@
+1-Abstractive vs Extractive? Extractive copies important sentences. Abstractive writes new sentences to capture the meaning (like a human). BART is abstractive.
+
+2- Why is BART good for this? It's an encoder-decoder model trained to reconstruct corrupted text, which is perfect for tasks like summarization.
+
+3- Encoder-Decoder? The encoder reads and understands the input text. The decoder uses that understanding to write a new sequence (the summary).
+
+4- Beam Search? A search strategy that keeps several likely options open instead of just picking the next best word. Can lead to better, more coherent summaries.
+
+5- Hallucinations? When the model generates information that isn't in the source text. A major risk in summarization because it makes the summary inaccurate.
+
+6- ROUGE/BLEU? Metrics to compare a machine-generated summary to a human-written one. They measure overlap of words and phrases.
+
+7- Fine-tuning on legal docs? You would collect a dataset of legal documents and their human-written summaries, then continue training the model on this specific data.
diff --git a/2-Text Summarizer API/main.py b/2-Text Summarizer API/main.py
@@ -0,0 +1,26 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import pipeline
+
+app = FastAPI()
+
+# Build the facebook/bart-large-cnn summarization pipeline once, at import time
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+# Request body schema: the text to be summarized
+class SummaryRequest(BaseModel):
+    text: str  # passed to the summarizer as its input
+
+@app.post('/summarize')
+def summarize_text(request: SummaryRequest):
+    # Summarize the submitted text with the BART pipeline.
+    # truncation=True asks the tokenizer to cut over-long input down to the
+    # model's maximum input length instead of passing it through untruncated.
+    result = summarizer(
+        request.text,
+        max_length=130,  # upper bound on the summary length
+        min_length=30,  # lower bound on the summary length
+        do_sample=False,  # no sampling, so the summary is deterministic
+        truncation=True,
+    )
+    # One input text was sent, so the first result holds its summary
+    return {
+        "summary": result[0]['summary_text']
+    }
+#Model is about 1.6GB
diff --git a/3-Sentiment Analysis API/__pycache__/main.cpython-313.pyc b/3-Sentiment Analysis API/__pycache__/main.cpython-313.pyc
diff --git a/3-Sentiment Analysis API/interview_qa.txt b/3-Sentiment Analysis API/interview_qa.txt
@@ -0,0 +1,13 @@
+1- Transfer Learning? Training a model on a general task (like language understanding) then fine-tuning it for a specific task (like sentiment). Much faster than training from scratch.
+
+2- DistilBERT vs BERT? DistilBERT is 40% smaller but retains 97% of BERT's performance. Faster and cheaper to run.
+
+3-SST-2? A standard benchmark dataset of movie reviews labeled as positive or negative.
+
+4-Embeddings? Numbers that represent the meaning of words. The model converts your text into embeddings to understand it.
+
+5-Evaluation? Accuracy, precision, recall, F1-score. For sentiment: how often is it correct?
+
+6-Biases? If trained mostly on movie reviews, it might perform poorly on technical texts or miss cultural context.
+
+7- Sarcasm? Very hard for models. Requires deep context understanding. Often needs additional context or specialized training.
diff --git a/3-Sentiment Analysis API/main.py b/3-Sentiment Analysis API/main.py
@@ -0,0 +1,22 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import pipeline
+
+app = FastAPI()
+# Load the sentiment-analysis model once, at import time
+
+classifier = pipeline("sentiment-analysis", model ="distilbert-base-uncased-finetuned-sst-2-english")
+
+# Request body schema: the text to classify
+class SentimentRequest(BaseModel):
+    text: str  # passed to the classifier as its input
+
+@app.post("/sentiment")
+def analyze_sentiment(request: SentimentRequest):
+    # The classifier returns a list of results; a single input yields one entry
+    predictions = classifier(request.text)
+    top_prediction = predictions[0]
+
+    label = top_prediction['label']
+    # Keep four decimal places of the score
+    confidence = round(top_prediction['score'], 4)
+    return {"sentiment": label, "confidence": confidence}
+#Model is 269MB
diff --git a/4-Multimodal image captioning API/interview_qa.txt b/4-Multimodal image captioning API/interview_qa.txt
@@ -0,0 +1,13 @@
+1- How does ViT process images? It splits images into patches (like puzzle pieces) and processes them like a language model processes words.
+
+2- GPT-2's role? It takes the image understanding from ViT and generates a coherent sentence description.
+
+3- Why combine vision + language? ViT understands what's in the image, GPT-2 knows how to describe it naturally.
+
+4- Captioning datasets? COCO - contains images with multiple human-written captions.
+
+5- Challenges? Handling fine details, counting objects accurately, understanding abstract concepts.
+
+6- Evaluation metrics? BLEU (n-gram overlap with reference captions), CIDEr (TF-IDF-weighted n-gram agreement with the consensus of several human-written captions).
+
+7- Real-world apps? Accessibility (describing images for visually impaired), social media auto-alt-text, surveillance systems.
diff --git a/4-Multimodal image captioning API/main.py b/4-Multimodal image captioning API/main.py
@@ -0,0 +1,20 @@
+from fastapi import FastAPI, File, UploadFile
+from PIL import Image
+from transformers import pipeline
+
+app = FastAPI()
+# Build the image-to-text captioning pipeline once, at import time
+captioner = pipeline("image-to-text" , model="nlpconnect/vit-gpt2-image-captioning")
+
+# The leading "@" applies app.post as a decorator so the route is registered;
+# a bare app.post(...) call statement would build the decorator and discard it.
+@app.post("/caption-image")
+async def caption_image(file: UploadFile = File(...)):
+    # Open the upload's underlying file object as a PIL image
+    image = Image.open(file.file)
+
+    # Run the captioning pipeline on the image
+    result = captioner(image)
+
+    # The first entry of the pipeline output carries the caption text
+    return {
+        "caption": result[0]['generated_text']
+    }
+#Model is about 500MBs
diff --git a/5-Naive RAG system/__pycache__/doc_loader.cpython-313.pyc b/5-Naive RAG system/__pycache__/doc_loader.cpython-313.pyc
diff --git a/5-Naive RAG system/__pycache__/document_loader.cpython-313.pyc b/5-Naive RAG system/__pycache__/document_loader.cpython-313.pyc
diff --git a/5-Naive RAG system/__pycache__/main.cpython-313.pyc b/5-Naive RAG system/__pycache__/main.cpython-313.pyc
diff --git a/5-Naive RAG system/__pycache__/rag.cpython-313.pyc b/5-Naive RAG system/__pycache__/rag.cpython-313.pyc
diff --git a/5-Naive RAG system/data/cameroon.txt b/5-Naive RAG system/data/cameroon.txt
@@ -0,0 +1,74 @@
+
+
+# Cameroon – Comprehensive Overview
+
+## 1. Geography and Location
+
+Cameroon is a country in Central Africa, often called "Africa in miniature" because of its geological, linguistic, and cultural diversity. It shares borders with Nigeria to the west, Chad to the northeast, the Central African Republic to the east, and Equatorial Guinea, Gabon, and the Republic of the Congo to the south. It has a coastline along the Atlantic Ocean (Gulf of Guinea).
+
+* **Capital city:** Yaoundé
+* **Largest city and economic hub:** Douala
+* **Climate:** Ranges from equatorial in the south, tropical in the center, to semi-arid in the north.
+* **Notable landmarks:** Mount Cameroon (the highest peak in West and Central Africa), Lake Nyos, and Waza National Park.
+
+## 2. History
+
+* **Pre-colonial era:** Inhabited by diverse ethnic groups, kingdoms, and chiefdoms such as the Bamileke, Bamoun, and Fulani emirates.
+* **Colonial period:** First colonized by Germany in 1884 (Kamerun). After World War I, the territory was divided between France and Britain under League of Nations mandates.
+* **Independence:** French Cameroon gained independence in 1960 as the Republic of Cameroon. In 1961, part of British Southern Cameroons joined to form the Federal Republic of Cameroon. The system later shifted to a unitary state in 1972.
+* **Modern era:** Cameroon has faced political challenges, including governance under President Paul Biya since 1982, and ongoing Anglophone crises in the Northwest and Southwest regions.
+
+## 3. Politics and Governance
+
+* **System:** Unitary presidential republic.
+* **President:** Head of state and government, commander-in-chief, with extensive executive powers.
+* **Parliament:** Bicameral, consisting of the National Assembly and Senate.
+* **Judiciary:** Formally independent but influenced by the executive branch.
+* **Challenges:** Limited political pluralism, concerns about democracy, human rights issues, and regional conflicts.
+
+## 4. Economy
+
+Cameroon has a mixed economy with agriculture, oil, and services as major contributors.
+
+* **Agriculture:** Cocoa, coffee, bananas, cotton, maize, cassava, palm oil.
+* **Natural resources:** Oil, gas, timber, bauxite, iron ore.
+* **Industry and services:** Douala port is the main economic hub; Yaoundé houses administration and diplomatic services.
+* **Currency:** Central African CFA franc (XAF).
+* **Economic challenges:** Poverty, corruption, limited infrastructure, and dependence on oil revenue.
+* **Opportunities:** Expanding digital economy, agriculture modernization, tourism potential.
+
+## 5. Demographics
+
+* **Population:** Over 27 million people.
+* **Languages:** Official languages are French and English, reflecting colonial history. Over 250 local languages are spoken (e.g., Ewondo, Fulfulde, Duala).
+* **Ethnic groups:** Diverse, including Bantu groups, Semi-Bantu, Sudanese groups, Fulani, and Pygmy communities.
+* **Religion:** Christianity (Roman Catholic and Protestant), Islam, and indigenous beliefs.
+
+## 6. Culture
+
+Cameroon is culturally rich and diverse.
+
+* **Music:** Makossa, Bikutsi, Njang, and modern Afropop.
+* **Cuisine:** Ndolé (vegetable stew with peanuts and fish or beef), Achu soup, roasted fish with plantains, puff-puff.
+* **Festivals:** Ngondo festival (Douala), cultural dances, and traditional ceremonies.
+* **Sports:** Football is the most popular sport. The men’s national team, the Indomitable Lions, has won multiple Africa Cup of Nations titles and performed strongly in FIFA World Cups.
+
+## 7. Education and Health
+
+* **Education system:** Primary, secondary, and tertiary education structured under Francophone and Anglophone subsystems. Universities in Yaoundé, Buea, Douala, Bamenda, and others serve as higher learning centers.
+* **Health challenges:** Malaria, HIV/AIDS, maternal and child health issues. Recent improvements include vaccination campaigns and private healthcare initiatives.
+
+## 8. Regional and International Relations
+
+* **Membership:** United Nations, African Union, Commonwealth of Nations, Organisation Internationale de la Francophonie, and Central African Economic and Monetary Community (CEMAC).
+* **Regional influence:** Cameroon plays a key role in Central Africa but faces tensions with Nigeria over the Bakassi Peninsula (settled by ICJ ruling in 2002).
+
+## 9. Current Issues
+
+* Political unrest and demands for decentralization or federalism in Anglophone regions.
+* Economic diversification efforts to reduce dependence on oil.
+* Youth unemployment and brain drain.
+* Climate change impacts on agriculture and Lake Chad basin.
+
+---
+
diff --git a/5-Naive RAG system/document_loader.py b/5-Naive RAG system/document_loader.py
@@ -0,0 +1,36 @@
+import os
+from langchain_community.document_loaders import TextLoader, PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+
+def load_documents_from_folder(folder_path: str = "data"):
+    """Collect every .pdf and .txt file in a folder and return them as chunks.
+
+    A missing folder is created and an empty list is returned. Files with any
+    other extension are skipped; a file that fails to load is reported on
+    stdout and skipped as well. Everything that loaded is then split with a
+    CharacterTextSplitter (chunk_size=300, chunk_overlap=30).
+    """
+    if not os.path.exists(folder_path):
+        os.makedirs(folder_path)
+        return []
+
+    collected = []
+    for entry in os.listdir(folder_path):
+        full_path = os.path.join(folder_path, entry)
+
+        try:
+            if entry.endswith('.pdf'):
+                pages = PyPDFLoader(full_path).load()
+            elif entry.endswith('.txt'):
+                pages = TextLoader(full_path, encoding='utf-8').load()
+            else:
+                # Not a supported file type
+                continue
+            collected.extend(pages)
+        except Exception as e:
+            # Best effort: report the failure and move on to the next file
+            print(f"Error loading {entry}: {e}")
+
+    # Chunking happens once, over everything that loaded successfully
+    splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=30)
+    return splitter.split_documents(collected)
diff --git a/5-Naive RAG system/interview_qa.txt b/5-Naive RAG system/interview_qa.txt
@@ -0,0 +1,13 @@
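+1- What is RAG? Retrieval-Augmented Generation: retrieving relevant documents at query time and giving them to the model as context, so it can use up-to-date, specific information it wasn't trained on.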
+
+2- Embeddings? Numbers that capture meaning. Similar texts have similar number patterns.
+
+3- Why Chroma? Lightweight, simple vector database perfect for learning and prototyping.
+
+4- Cosine similarity? Measures how similar two vectors are. Used to find the most relevant documents.
+
+5- Update knowledge? Add new documents to the vector store. The retriever will automatically include them.
+
+6- Risk of irrelevant docs? The model might use wrong information. Need good retrieval filtering.
+
+7- RAG vs fine-tuning? RAG adds knowledge without retraining. Fine-tuning teaches new skills/styles.
diff --git a/5-Naive RAG system/main.py b/5-Naive RAG system/main.py
@@ -0,0 +1,12 @@
+from rag import rag_query
+from pydantic import BaseModel
+from fastapi import FastAPI
+
+app= FastAPI()
+
+# Request body schema: the question to answer
+class QueryRequest(BaseModel):
+    question: str  # forwarded to rag_query
+
+@app.post("/rag-query")
+def query_rag(request: QueryRequest):
+    # Hand the question to the RAG pipeline and relay its result unchanged
+    rag_result = rag_query(request.question)
+    return rag_result
diff --git a/5-Naive RAG system/rag.py b/5-Naive RAG system/rag.py
@@ -0,0 +1,55 @@
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import HuggingFacePipeline
+from transformers import pipeline as hf_pipeline
+from document_loader import load_documents_from_folder
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.schema import Document
+
+# Embedding model used to vectorize documents for the vector store
+embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# Text-generation pipeline (distilgpt2) that writes the answers,
+# limited to 100 newly generated tokens per call
+# NOTE(review): temperature is set without do_sample=True; the pipeline may
+# ignore it and decode greedily — confirm against the transformers docs
+llm_pipeline = hf_pipeline(
+    "text-generation",
+    model="distilgpt2",
+    max_new_tokens=100,
+    temperature=0.3
+)
+# Wrap the pipeline in LangChain's HuggingFacePipeline so rag_query can call llm.invoke()
+llm = HuggingFacePipeline(pipeline=llm_pipeline)
+
+def initialize_rag():
+    """Build a Chroma retriever over the chunks loaded from the "data" folder.
+
+    When nothing is loaded, a single placeholder document is indexed instead,
+    so the vector store is never built from an empty list.
+    """
+    chunks = load_documents_from_folder("data") or [
+        Document(page_content="No documents found in /data folder.")
+    ]
+    store = Chroma.from_documents(chunks, embeddings)
+    return store.as_retriever()
+
+# Build the retriever once, when this module is imported
+retriever = initialize_rag()
+
+def rag_query(question: str):
+    """Answer a question from the indexed documents.
+
+    Retrieves documents for the question, places the top three in the prompt
+    as context, and asks the LLM for an answer.
+
+    Returns a dict with the "question", the "contexts" that were placed in
+    the prompt, and the LLM's "answer".
+    """
+    # Use invoke() instead of get_relevant_documents()
+    relevant_docs = retriever.invoke(question)
+    # Slice once so the prompt and the response report the same passages
+    contexts = [doc.page_content for doc in relevant_docs[:3]]
+    context = "\n\n".join(contexts)
+
+    prompt = f"""Use ONLY the following context to answer the question. Do not use any other knowledge:
+
+Context:
+{context}
+
+Question: {question}
+If the answer is not in the context, say "I cannot find the answer in the provided documents"
+
+Answer:"""
+
+    answer = llm.invoke(prompt)
+    return {
+        "question": question,
+        "contexts": contexts,
+        "answer": answer
+    }
diff --git a/6-FAISS+LangChain/__pycache__/document_loader.cpython-313.pyc b/6-FAISS+LangChain/__pycache__/document_loader.cpython-313.pyc
diff --git a/6-FAISS+LangChain/__pycache__/faiss_rag.cpython-313.pyc b/6-FAISS+LangChain/__pycache__/faiss_rag.cpython-313.pyc
diff --git a/6-FAISS+LangChain/__pycache__/main.cpython-313.pyc b/6-FAISS+LangChain/__pycache__/main.cpython-313.pyc