ironhack-labs · jk143kaur · Sep 11, 2025
diff --git a/Nlpchallenge.ipynb b/Nlpchallenge.ipynb
@@ -0,0 +1,267 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "990c7b3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Data handling\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Text preprocessing\n",
+    "import re\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "# Model selection\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
+    "\n",
+    "# Save predictions\n",
+    "import os\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "91b9424d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   label                                              title  \\\n",
+      "0      1  As U.S. budget fight looms, Republicans flip t...   \n",
+      "1      1  U.S. military to accept transgender recruits o...   \n",
+      "2      1  Senior U.S. Republican senator: 'Let Mr. Muell...   \n",
+      "3      1  FBI Russia probe helped by Australian diplomat...   \n",
+      "4      1  Trump wants Postal Service to charge 'much mor...   \n",
+      "\n",
+      "                                                text       subject  \\\n",
+      "0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   \n",
+      "1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   \n",
+      "2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   \n",
+      "3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   \n",
+      "4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   \n",
+      "\n",
+      "                 date  \n",
+      "0  December 31, 2017   \n",
+      "1  December 29, 2017   \n",
+      "2  December 31, 2017   \n",
+      "3  December 30, 2017   \n",
+      "4  December 29, 2017   \n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 39942 entries, 0 to 39941\n",
+      "Data columns (total 5 columns):\n",
+      " #   Column   Non-Null Count  Dtype \n",
+      "---  ------   --------------  ----- \n",
+      " 0   label    39942 non-null  int64 \n",
+      " 1   title    39942 non-null  object\n",
+      " 2   text     39942 non-null  object\n",
+      " 3   subject  39942 non-null  object\n",
+      " 4   date     39942 non-null  object\n",
+      "dtypes: int64(1), object(4)\n",
+      "memory usage: 1.5+ MB\n",
+      "None\n",
+      "label\n",
+      "1    19999\n",
+      "0    19943\n",
+      "Name: count, dtype: int64\n",
+      "label      0\n",
+      "title      0\n",
+      "text       0\n",
+      "subject    0\n",
+      "date       0\n",
+      "dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load training dataset\n",
+    "data = pd.read_csv(\"dataset/data.csv\")\n",
+    "\n",
+    "# Quick look: first rows, schema, label counts and missing values per column\n",
+    "print(data.head())\n",
+    "# info() prints its own report and returns None, so wrapping it in print()\n",
+    "# only adds a stray \"None\" line to the output.\n",
+    "data.info()\n",
+    "print(data['label'].value_counts())\n",
+    "print(data.isnull().sum())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d0952a95",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_text(text):\n",
+    "    \"\"\"Normalise raw article text before TF-IDF vectorisation.\n",
+    "\n",
+    "    Lower-cases the input, strips URLs, drops every character other than\n",
+    "    a-z and whitespace, then collapses whitespace runs to single spaces.\n",
+    "\n",
+    "    Args:\n",
+    "        text: Raw text. Missing values (None/NaN) give an empty string;\n",
+    "            any other non-string value is converted with str() first.\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned, lower-cased string.\n",
+    "    \"\"\"\n",
+    "    if not isinstance(text, str):\n",
+    "        # A missing title/body arrives as NaN; calling .lower() on it would\n",
+    "        # raise AttributeError and abort the whole .apply().\n",
+    "        text = \"\" if pd.isna(text) else str(text)\n",
+    "    text = text.lower()\n",
+    "    text = re.sub(r\"http\\S+|www\\S+\", \"\", text)   # remove URLs\n",
+    "    text = re.sub(r\"[^a-z\\s]\", \"\", text)         # remove punctuation and numbers\n",
+    "    text = re.sub(r\"\\s+\", \" \", text).strip()     # remove extra whitespace\n",
+    "    return text\n",
+    "\n",
+    "# Build one document per article from title + body, then clean it.\n",
+    "# fillna(\"\") keeps a row usable when only one of the two fields is missing:\n",
+    "# without it, NaN + str is NaN and the other field's text would be lost.\n",
+    "data['clean_text'] = (\n",
+    "    data['title'].fillna(\"\") + \" \" + data['text'].fillna(\"\")\n",
+    ").apply(clean_text)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4d3a139e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Features are the cleaned title+body strings; the target is the label column.\n",
+    "X, y = data['clean_text'], data['label']\n",
+    "\n",
+    "# Hold out 20% of the rows for testing, stratified on y, with a fixed seed\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X,\n",
+    "    y,\n",
+    "    test_size=0.2,\n",
+    "    stratify=y,\n",
+    "    random_state=42,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d1b206c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TF-IDF over unigrams and bigrams, capped at 5000 features.\n",
+    "vectorizer = TfidfVectorizer(\n",
+    "    ngram_range=(1, 2),\n",
+    "    max_features=5000,\n",
+    ")\n",
+    "\n",
+    "# Fit on the training split only; the test split is transformed with the\n",
+    "# already-fitted vectorizer.\n",
+    "X_train_vec = vectorizer.fit_transform(X_train)\n",
+    "X_test_vec = vectorizer.transform(X_test)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "82594345",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.9892351983977969\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.99      0.99      0.99      3989\n",
+      "           1       0.99      0.99      0.99      4000\n",
+      "\n",
+      "    accuracy                           0.99      7989\n",
+      "   macro avg       0.99      0.99      0.99      7989\n",
+      "weighted avg       0.99      0.99      0.99      7989\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Train the classifier on the TF-IDF features (fit() returns the estimator)\n",
+    "model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)\n",
+    "\n",
+    "# Score the held-out split\n",
+    "y_pred = model.predict(X_test_vec)\n",
+    "\n",
+    "# Report overall accuracy, then per-class precision/recall/F1\n",
+    "test_accuracy = accuracy_score(y_test, y_pred)\n",
+    "print(\"Accuracy:\", test_accuracy)\n",
+    "\n",
+    "test_report = classification_report(y_test, y_pred)\n",
+    "print(test_report)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "acb02584",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load validation data\n",
+    "val_data = pd.read_csv(\"dataset/validation_data.csv\")\n",
+    "\n",
+    "# Clean text the same way as the training data. fillna(\"\") stops a missing\n",
+    "# title or body from turning the whole row into NaN, which would otherwise\n",
+    "# drop the remaining field's text or crash the cleaning step.\n",
+    "val_data['clean_text'] = (\n",
+    "    val_data['title'].fillna(\"\") + \" \" + val_data['text'].fillna(\"\")\n",
+    ").apply(clean_text)\n",
+    "\n",
+    "# Vectorize with the vectorizer already fitted on the training split\n",
+    "X_val_vec = vectorizer.transform(val_data['clean_text'])\n",
+    "\n",
+    "# Predict labels\n",
+    "val_pred = model.predict(X_val_vec)\n",
+    "\n",
+    "# Overwrite the label column with the predictions\n",
+    "# NOTE(review): presumably the file ships with a placeholder label (2) - confirm\n",
+    "val_data['label'] = val_pred\n",
+    "\n",
+    "# Save predictions: label column only, one row per validation row, in file order\n",
+    "val_data[['label']].to_csv(\"predictions.csv\", index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "71fced5c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Estimated Accuracy: 0.9892351983977969\n",
+      "\n",
+      "Classification Report:\n",
+      "               precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.99      0.99      0.99      3989\n",
+      "           1       0.99      0.99      0.99      4000\n",
+      "\n",
+      "    accuracy                           0.99      7989\n",
+      "   macro avg       0.99      0.99      0.99      7989\n",
+      "weighted avg       0.99      0.99      0.99      7989\n",
+      "\n",
+      "\n",
+      "Confusion Matrix:\n",
+      " [[3941   48]\n",
+      " [  38 3962]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# accuracy_score, classification_report and confusion_matrix are already\n",
+    "# imported in the notebook's import cell, so they are not re-imported here.\n",
+    "\n",
+    "# Overall accuracy on the held-out test split\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "print(\"Estimated Accuracy:\", accuracy)\n",
+    "\n",
+    "# Detailed per-class report\n",
+    "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n",
+    "\n",
+    "# Confusion matrix (rows: true label, columns: predicted label)\n",
+    "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred))\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}