Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
267 changes: 267 additions & 0 deletions Nlpchallenge.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "990c7b3b",
"metadata": {},
"outputs": [],
"source": [
"# Data handling\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Text preprocessing\n",
"import re\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# Model selection\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"\n",
"# Save predictions\n",
"import os\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "91b9424d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" label title \\\n",
"0 1 As U.S. budget fight looms, Republicans flip t... \n",
"1 1 U.S. military to accept transgender recruits o... \n",
"2 1 Senior U.S. Republican senator: 'Let Mr. Muell... \n",
"3 1 FBI Russia probe helped by Australian diplomat... \n",
"4 1 Trump wants Postal Service to charge 'much mor... \n",
"\n",
" text subject \\\n",
"0 WASHINGTON (Reuters) - The head of a conservat... politicsNews \n",
"1 WASHINGTON (Reuters) - Transgender people will... politicsNews \n",
"2 WASHINGTON (Reuters) - The special counsel inv... politicsNews \n",
"3 WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews \n",
"4 SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews \n",
"\n",
" date \n",
"0 December 31, 2017 \n",
"1 December 29, 2017 \n",
"2 December 31, 2017 \n",
"3 December 30, 2017 \n",
"4 December 29, 2017 \n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 39942 entries, 0 to 39941\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 label 39942 non-null int64 \n",
" 1 title 39942 non-null object\n",
" 2 text 39942 non-null object\n",
" 3 subject 39942 non-null object\n",
" 4 date 39942 non-null object\n",
"dtypes: int64(1), object(4)\n",
"memory usage: 1.5+ MB\n",
"None\n",
"label\n",
"1 19999\n",
"0 19943\n",
"Name: count, dtype: int64\n",
"label 0\n",
"title 0\n",
"text 0\n",
"subject 0\n",
"date 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Load training dataset\n",
"data = pd.read_csv(\"dataset/data.csv\")\n",
"\n",
"# Quick look\n",
"print(data.head())\n",
"print(data.info())\n",
"print(data['label'].value_counts())\n",
"print(data.isnull().sum())\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d0952a95",
"metadata": {},
"outputs": [],
"source": [
"def clean_text(text):\n",
" text = text.lower()\n",
" text = re.sub(r\"http\\S+|www\\S+\", \"\", text) # remove URLs\n",
" text = re.sub(r\"[^a-z\\s]\", \"\", text) # remove punctuation and numbers\n",
" text = re.sub(r\"\\s+\", \" \", text).strip() # remove extra whitespace\n",
" return text\n",
"\n",
"# Apply to titles or text (or both)\n",
"data['clean_text'] = data['title'] + \" \" + data['text']\n",
"data['clean_text'] = data['clean_text'].apply(clean_text)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4d3a139e",
"metadata": {},
"outputs": [],
"source": [
"X = data['clean_text']\n",
"y = data['label']\n",
"\n",
"# Train-test split (80/20)\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=0.2, random_state=42, stratify=y\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d1b206c8",
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))\n",
"X_train_vec = vectorizer.fit_transform(X_train)\n",
"X_test_vec = vectorizer.transform(X_test)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "82594345",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9892351983977969\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.99 0.99 3989\n",
" 1 0.99 0.99 0.99 4000\n",
"\n",
" accuracy 0.99 7989\n",
" macro avg 0.99 0.99 0.99 7989\n",
"weighted avg 0.99 0.99 0.99 7989\n",
"\n"
]
}
],
"source": [
"model = LogisticRegression(max_iter=1000)\n",
"model.fit(X_train_vec, y_train)\n",
"\n",
"# Predict on test set\n",
"y_pred = model.predict(X_test_vec)\n",
"\n",
"# Evaluate\n",
"print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
"print(classification_report(y_test, y_pred))\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "acb02584",
"metadata": {},
"outputs": [],
"source": [
"# Load validation data\n",
"val_data = pd.read_csv(\"dataset/validation_data.csv\")\n",
"\n",
"# Clean text\n",
"val_data['clean_text'] = (val_data['title'] + \" \" + val_data['text']).apply(clean_text)\n",
"\n",
"# Vectorize\n",
"X_val_vec = vectorizer.transform(val_data['clean_text'])\n",
"\n",
"# Predict labels\n",
"val_pred = model.predict(X_val_vec)\n",
"\n",
"# Replace label 2 with predictions (if any)\n",
"val_data['label'] = val_pred\n",
"\n",
"# Save predictions\n",
"val_data[['label']].to_csv(\"predictions.csv\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "71fced5c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Estimated Accuracy: 0.9892351983977969\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.99 0.99 0.99 3989\n",
" 1 0.99 0.99 0.99 4000\n",
"\n",
" accuracy 0.99 7989\n",
" macro avg 0.99 0.99 0.99 7989\n",
"weighted avg 0.99 0.99 0.99 7989\n",
"\n",
"\n",
"Confusion Matrix:\n",
" [[3941 48]\n",
" [ 38 3962]]\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"\n",
"# Overall accuracy\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print(\"Estimated Accuracy:\", accuracy)\n",
"\n",
"# Detailed report\n",
"print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n",
"\n",
"# Confusion matrix\n",
"print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading