diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/.DS_Store differ diff --git a/1.fake-news-LogisticRegression.ipynb b/1.fake-news-LogisticRegression.ipynb new file mode 100644 index 0000000..d2326aa --- /dev/null +++ b/1.fake-news-LogisticRegression.ipynb @@ -0,0 +1,863 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "id": "4dc82578", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1ff89aef", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"dataset/data.csv\")\n", + "\n", + "# remove empty rows\n", + "df = df[df['title'] != '']\n", + "df = df[df['text'] != '']" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4763a7f6", + "metadata": {}, + "outputs": [], + "source": [ + "# Remove duplicate rows based on the 'text' column\n", + "df = df.drop_duplicates(subset=['text']) \n", + "\n", + "# Remove rows with 'text' is NaN\n", + "df = df.dropna(subset=['text']) \n", + "\n", + "# Remove rows with 'label' is NaN\n", + "df = df.dropna(subset=['label']) \n", + "\n", + "# Remove rows with 'text' empty or only with whitespace\n", + "df = df[df['text'].str.strip() != ''] " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "41340b02", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] /Users/luis.guimaraes/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + } + ], + "source": [ + 
"import re\n", + "import string\n", + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "# Download required NLTK data if not already downloaded\n", + "nltk.download('wordnet')\n", + "\n", + "def clean_text(text):\n", + " text = text.lower()\n", + " text = re.sub(r'\\[.*?\\]', '', text)\n", + " text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text)\n", + " text = re.sub(r'<.*?>+', '', text)\n", + " text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)\n", + " text = re.sub(r'\\n', '', text)\n", + " text = re.sub(r'\\w*\\d\\w*', '', text)\n", + " \n", + " # Tokenize the text\n", + " tokens = word_tokenize(text)\n", + " \n", + " # Initialize Lemmatizer\n", + " lemmatizer = WordNetLemmatizer()\n", + " \n", + " # Lemmatize each token\n", + " lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]\n", + " \n", + " # Join tokens back into a string\n", + " text = ' '.join(lemmatized_tokens)\n", + " return text\n", + "\n", + "df['text_clean'] = df['title'] + \" \" + df['text']\n", + "df['text_clean'] = df['text_clean'].apply(clean_text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "296fe39d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Split the data into training and testing sets\n", + "X = df['text_clean']\n", + "y = df['label']\n", + "\n", + "# train\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a64b0b16", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "vectorizer = TfidfVectorizer(\n", + " max_features=8000, # limit the number of features\n", + " stop_words='english',\n", + " min_df=5, # ignore rare words\n", + " max_df=0.8 # ignore overly common words\n", + ")\n", + "\n", + "# Fit only on training, transform both\n", + "X_train_vec = vectorizer.fit_transform(X_train)\n", + "X_test_vec = 
vectorizer.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "c381617c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LogisticRegression(C=0.5, class_weight='balanced', max_iter=1000, penalty='l1',\n",
+       "                   random_state=42, solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LogisticRegression(C=0.5, class_weight='balanced', max_iter=1000, penalty='l1',\n", + " random_state=42, solver='saga')" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LogisticRegression(\n", + " class_weight='balanced',\n", + " solver='saga', # Algorithm to use in the optimization problem\n", + " penalty='l1', # Specify the norm of the penalty\n", + " C=0.5, # Inverse of regularization strength; smaller values specify stronger regularization\n", + " max_iter=1000, # Maximum number of iterations taken for the solvers to converge\n", + " random_state=42 # For reproducibility\n", + ")\n", + "\n", + "model.fit(X_train_vec, y_train)\n", + "\n", + "\n", + "#model = LogisticRegression(class_weight='balanced')\n", + "#model.fit(X_train_vec, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "484d6523", + "metadata": {}, + "outputs": [], + "source": [ + "# Predict on the test set\n", + "y_pred = model.predict(X_test_vec)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5f194883", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['tfidf_vectorizer.pkl']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import joblib\n", + "\n", + "# Save model\n", + "joblib.dump(model, 'logistic_model.pkl')\n", + "\n", + "# Save TF-IDF vectorizer\n", + "joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2ef43d5f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.99 0.99 0.99 3241\n", + " 1 0.99 0.99 0.99 3954\n", + "\n", + " accuracy 0.99 7195\n", + " macro avg 0.99 0.99 0.99 7195\n", + "weighted avg 0.99 0.99 0.99 7195\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + 
"text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Classification report and confusion matrix\n", + "print(classification_report(y_test, y_pred))\n", + "sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')\n", + "plt.title(\"Confusion Matrix\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ab90e014", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline accuracy: 0.54954829742877\n" + ] + } + ], + "source": [ + "from sklearn.dummy import DummyClassifier\n", + "dummy = DummyClassifier(strategy=\"most_frequent\")\n", + "dummy.fit(X_train_vec, y_train)\n", + "print(\"Baseline accuracy:\", dummy.score(X_test_vec, y_test))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "a787659a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Probabilities: [[0.99015316 0.00984684]]\n" + ] + } + ], + "source": [ + "sample_text = [\"This is a sample news article.\"] # Replace with your sample text\n", + "sample_vec = vectorizer.transform(sample_text)\n", + "\n", + "probs = model.predict_proba(sample_vec)\n", + "print(\"Probabilities:\", probs)\n" + ] + }, + { + "cell_type": "markdown", + "id": "37fa4daf", + "metadata": {}, + "source": [ + "# Load validation_data.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e984f239", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] /Users/luis.guimaraes/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + } + ], + "source": [ + "\n", + "from nltk.stem import PorterStemmer\n", + "from nltk.tokenize import word_tokenize\n", + "nltk.download('punkt')\n", + "\n", + "# Define stemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "# Define stem_text function\n", + 
"def stem_text(text):\n", + "\tif isinstance(text, str):\n", + "\t\t# Tokenize the text\n", + "\t\ttokens = word_tokenize(text.lower())\n", + "\t\t# Apply stemming\n", + "\t\tstemmed_tokens = [stemmer.stem(token) for token in tokens]\n", + "\t\t# Join tokens back into a string\n", + "\t\treturn ' '.join(stemmed_tokens)\n", + "\treturn ''\n", + "\n", + "# Load validation data and prepare it for prediction\n", + "validation_df = pd.read_csv(\"dataset/validation_data.csv\")\n", + "\n", + "# Clean NaNs before applying\n", + "validation_df['title'] = validation_df['title'].fillna('')\n", + "validation_df['text'] = validation_df['text'].fillna('')\n", + "\n", + "# Apply stemming\n", + "validation_df['title'] = validation_df['title'].apply(stem_text)\n", + "validation_df['text'] = validation_df['text'].apply(stem_text)\n", + "\n", + "# Combine title and text\n", + "validation_df['text_clean'] = (validation_df['title'] + ' ' + validation_df['text']).str.strip()\n", + "validation_df = validation_df[validation_df['text_clean'] != '']\n", + "\n", + "# Prepare features (ignore label column as instructed)\n", + "X_val = validation_df['text_clean']\n", + "\n", + "# Transform using the same vectorizer used for training\n", + "X_val_vec = vectorizer.transform(X_val)\n", + "\n", + "# Get predictions (0 or 1)\n", + "predictions = model.predict(X_val_vec)\n", + "\n", + "# Add predictions to the validation dataframe\n", + "validation_df['predicted_label'] = predictions\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "217a1684", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First few predictions:\n", + " title predicted_label\n", + "0 uk 's may 'receiv regular updat ' on london tu... 0\n", + "1 uk transport polic lead investig of london inc... 0\n", + "2 pacif nation crack down on north korean ship a... 0\n", + "3 three suspect al qaeda milit kill in yemen dro... 
0\n", + "4 chines academ prod beij to consid north korea ... 1\n", + "\n", + "Prediction counts:\n", + "predicted_label\n", + "0 4668\n", + "1 288\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Display the first few predictions\n", + "print(\"First few predictions:\")\n", + "print(validation_df[['title', 'predicted_label']].head())\n", + "\n", + "validation_df[['title', 'predicted_label']].to_csv('validation_predictions.csv', index=False)\n", + "\n", + "# Create a copy with index as id\n", + "#result_df = pd.DataFrame({\n", + "# 'id': validation_df.index,\n", + "# 'predicted_label': validation_df['predicted_label']\n", + "#})\n", + "\n", + "validation_df.to_csv('validation_predictions.csv', index=False)\n", + "\n", + "# Count of each prediction class\n", + "print(\"\\nPrediction counts:\")\n", + "print(validation_df['predicted_label'].value_counts())\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "3.10.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/1.fake-news-RandonForest copy.ipynb b/1.fake-news-RandonForest copy.ipynb new file mode 100644 index 0000000..da31f09 --- /dev/null +++ b/1.fake-news-RandonForest copy.ipynb @@ -0,0 +1,861 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4dc82578", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "import seaborn as sns\n", + "import 
matplotlib.pyplot as plt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1ff89aef", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"dataset/data.csv\")\n", + "\n", + "# remove empty rows\n", + "df = df[df['title'] != '']\n", + "df = df[df['text'] != '']" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4763a7f6", + "metadata": {}, + "outputs": [], + "source": [ + "# Remove duplicate rows based on the 'text' column\n", + "df = df.drop_duplicates(subset=['text']) \n", + "\n", + "# Remove rows with 'text' is NaN\n", + "df = df.dropna(subset=['text']) \n", + "\n", + "# Remove rows with 'label' is NaN\n", + "df = df.dropna(subset=['label']) \n", + "\n", + "# Remove rows with 'text' empty or only with whitespace\n", + "df = df[df['text'].str.strip() != ''] " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "41340b02", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] /Users/luis.guimaraes/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] /Users/luis.guimaraes/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + } + ], + "source": [ + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "# Download required NLTK data if not already downloaded\n", + "nltk.download('wordnet')\n", + "nltk.download('punkt')\n", + "\n", + "def clean_text(text):\n", + " text = text.lower()\n", + " text = re.sub(r'\\[.*?\\]', '', text)\n", + " text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text)\n", + " text = re.sub(r'<.*?>+', '', text)\n", + " text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)\n", + " text = re.sub(r'\\n', '', text)\n", + " text = 
re.sub(r'\\w*\\d\\w*', '', text)\n", + " \n", + " # Tokenize the text\n", + " tokens = word_tokenize(text)\n", + " \n", + " # Initialize Lemmatizer\n", + " lemmatizer = WordNetLemmatizer()\n", + " \n", + " # Lemmatize each token\n", + " lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]\n", + " \n", + " # Join tokens back into a string\n", + " text = ' '.join(lemmatized_tokens)\n", + " return text\n", + "\n", + "df['text_clean'] = df['title'] + \" \" + df['text']\n", + "df['text_clean'] = df['text_clean'].apply(clean_text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "296fe39d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Split the data into training and testing sets\n", + "X = df['text_clean']\n", + "y = df['label']\n", + "\n", + "# train\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a64b0b16", + "metadata": {}, + "outputs": [], + "source": [ + "#vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')\n", + "vectorizer = TfidfVectorizer(\n", + " max_features=8000, # limit the number of features\n", + " stop_words='english',\n", + " min_df=5, # ignore rare words\n", + " max_df=0.8 # ignore overly common words\n", + ")\n", + "\n", + "# Fit only on training, transform both\n", + "X_train_vec = vectorizer.fit_transform(X_train)\n", + "X_test_vec = vectorizer.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c381617c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "model = RandomForestClassifier(\n", + " n_estimators=100, # Number of trees\n", + " max_depth=None, # Maximum depth of trees (None means unlimited)\n", + " min_samples_split=2,\n", + " min_samples_leaf=1,\n", + " class_weight='balanced', # Same as your Logistic Regression\n", + " random_state=42, # For reproducibility\n", + " n_jobs=-1 # Use all available cores\n", + ")\n", + "\n", + "# Train the model (this stays the same)\n", + "model.fit(X_train_vec, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "484d6523", + "metadata": {}, + "outputs": [], + "source": [ + "# Predict on the test set\n", + "y_pred = model.predict(X_test_vec)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5f194883", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['tfidf_vectorizer_rf.pkl']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import joblib\n", + "\n", + "# Save model\n", + "joblib.dump(model, 'random_forest_model.pkl')\n", + "\n", + "# Save TF-IDF vectorizer (this stays the same)\n", + "joblib.dump(vectorizer, 'tfidf_vectorizer_rf.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2ef43d5f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.99 0.99 3241\n", + " 1 0.99 1.00 1.00 3954\n", + "\n", + " accuracy 1.00 7195\n", + " macro avg 1.00 1.00 1.00 7195\n", + "weighted avg 1.00 1.00 1.00 7195\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Classification report and confusion matrix\n", + "print(classification_report(y_test, y_pred))\n", + "sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')\n", + "plt.title(\"Confusion Matrix\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ab90e014", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline accuracy: 0.54954829742877\n" + ] + } + ], + "source": [ + "from sklearn.dummy import DummyClassifier\n", + "dummy = DummyClassifier(strategy=\"most_frequent\")\n", + "dummy.fit(X_train_vec, y_train)\n", + "print(\"Baseline accuracy:\", dummy.score(X_test_vec, y_test))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a787659a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Probabilities: [[0.99 0.01]]\n" + ] + } + ], + "source": [ + "sample_text = [\"This is a sample news article.\"] # Replace with your sample text\n", + "sample_vec = vectorizer.transform(sample_text)\n", + "\n", + "probs = model.predict_proba(sample_vec)\n", + "print(\"Probabilities:\", probs)\n" + ] + }, + { + "cell_type": "markdown", + "id": "37fa4daf", + "metadata": {}, + "source": [ + "# Load validation_data.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e984f239", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] /Users/luis.guimaraes/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + } + ], + "source": [ + "\n", + "# Import stemmer from nltk\n", + "from nltk.stem import PorterStemmer\n", + "nltk.download('punkt') # Need this for word_tokenize\n", + "\n", + "# Initialize stemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "# Define stem_text 
function\n", + "def stem_text(text):\n", + "\tif isinstance(text, str):\n", + "\t\t# Tokenize the text\n", + "\t\ttokens = word_tokenize(text.lower())\n", + "\t\t# Apply stemming\n", + "\t\tstemmed_tokens = [stemmer.stem(token) for token in tokens]\n", + "\t\t# Join tokens back into a string\n", + "\t\treturn ' '.join(stemmed_tokens)\n", + "\treturn ''\n", + "\n", + "# Load validation data and prepare it for prediction\n", + "validation_df = pd.read_csv(\"dataset/validation_data.csv\")\n", + "\n", + "# Clean NaNs before applying\n", + "validation_df['title'] = validation_df['title'].fillna('')\n", + "validation_df['text'] = validation_df['text'].fillna('')\n", + "\n", + "# Apply stemming\n", + "validation_df['title'] = validation_df['title'].apply(stem_text)\n", + "validation_df['text'] = validation_df['text'].apply(stem_text)\n", + "\n", + "# Combine title and text\n", + "validation_df['text_clean'] = (validation_df['title'] + ' ' + validation_df['text']).str.strip()\n", + "validation_df = validation_df[validation_df['text_clean'] != '']\n", + "\n", + "# Prepare features (ignore label column as instructed)\n", + "X_val = validation_df['text_clean']\n", + "\n", + "# Transform using the same vectorizer used for training\n", + "X_val_vec = vectorizer.transform(X_val)\n", + "\n", + "# Get predictions (0 or 1)\n", + "predictions = model.predict(X_val_vec)\n", + "\n", + "# Add predictions to the validation dataframe\n", + "validation_df['predicted_label'] = predictions\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "217a1684", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First few predictions:\n", + " title predicted_label\n", + "0 uk 's may 'receiv regular updat ' on london tu... 1\n", + "1 uk transport polic lead investig of london inc... 0\n", + "2 pacif nation crack down on north korean ship a... 1\n", + "3 three suspect al qaeda milit kill in yemen dro... 
1\n", + "4 chines academ prod beij to consid north korea ... 1\n", + "\n", + "Prediction counts:\n", + "predicted_label\n", + "0 4207\n", + "1 749\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Display the first few predictions\n", + "print(\"First few predictions:\")\n", + "print(validation_df[['title', 'predicted_label']].head())\n", + "\n", + "validation_df[['title', 'predicted_label']].to_csv('validation_predictions.csv', index=False)\n", + "\n", + "# Create a copy with index as id\n", + "#result_df = pd.DataFrame({\n", + "# 'id': validation_df.index,\n", + "# 'predicted_label': validation_df['predicted_label']\n", + "#})\n", + "\n", + "validation_df.to_csv('validation_predictions-rf.csv', index=False)\n", + "\n", + "# Count of each prediction class\n", + "print(\"\\nPrediction counts:\")\n", + "print(validation_df['predicted_label'].value_counts())\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "3.10.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/1.fake-news-XGBoost.ipynb b/1.fake-news-XGBoost.ipynb new file mode 100644 index 0000000..6781eda --- /dev/null +++ b/1.fake-news-XGBoost.ipynb @@ -0,0 +1,859 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4dc82578", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n" + ] + 
}, + { + "cell_type": "code", + "execution_count": 2, + "id": "1ff89aef", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"dataset/data.csv\")\n", + "\n", + "# remove empty rows\n", + "df = df[df['title'] != '']\n", + "df = df[df['text'] != '']" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4763a7f6", + "metadata": {}, + "outputs": [], + "source": [ + "# Remove duplicate rows based on the 'text' column\n", + "df = df.drop_duplicates(subset=['text']) \n", + "\n", + "# Remove rows with 'text' is NaN\n", + "df = df.dropna(subset=['text']) \n", + "\n", + "# Remove rows with 'label' is NaN\n", + "df = df.dropna(subset=['label']) \n", + "\n", + "# Remove rows with 'text' empty or only with whitespace\n", + "df = df[df['text'].str.strip() != ''] " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "41340b02", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] /Users/luis.guimaraes/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + } + ], + "source": [ + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "# Download required NLTK data if not already downloaded\n", + "nltk.download('wordnet')\n", + "\n", + "def clean_text(text):\n", + " text = text.lower()\n", + " text = re.sub(r'\\[.*?\\]', '', text)\n", + " text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text)\n", + " text = re.sub(r'<.*?>+', '', text)\n", + " text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)\n", + " text = re.sub(r'\\n', '', text)\n", + " text = re.sub(r'\\w*\\d\\w*', '', text)\n", + " \n", + " # Tokenize the text\n", + " tokens = word_tokenize(text)\n", + " \n", + " # Initialize Lemmatizer\n", + " lemmatizer = WordNetLemmatizer()\n", + " \n", + " # Lemmatize each token\n", + " 
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]\n", + " \n", + " # Join tokens back into a string\n", + " text = ' '.join(lemmatized_tokens)\n", + " return text\n", + "\n", + "df['text_clean'] = df['title'] + \" \" + df['text']\n", + "df['text_clean'] = df['text_clean'].apply(clean_text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "296fe39d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Split the data into training and testing sets\n", + "X = df['text_clean']\n", + "y = df['label']\n", + "\n", + "# train\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a64b0b16", + "metadata": {}, + "outputs": [], + "source": [ + "#vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')\n", + "vectorizer = TfidfVectorizer(\n", + " max_features=8000, # limit the number of features\n", + " stop_words='english',\n", + " min_df=5, # ignore rare words\n", + " max_df=0.8 # ignore overly common words\n", + ")\n", + "\n", + "# Fit only on training, transform both\n", + "X_train_vec = vectorizer.fit_transform(X_train)\n", + "X_test_vec = vectorizer.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c381617c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Install XGBoost if needed\n", + "!pip install xgboost\n", + "\n", + "model = RandomForestClassifier(\n", + " n_estimators=100, # Number of trees\n", + " max_depth=None, # Maximum depth of trees (None means unlimited)\n", + " min_samples_split=2,\n", + " min_samples_leaf=1,\n", + " class_weight='balanced', # Same as your Logistic Regression\n", + " random_state=42, # For reproducibility\n", + " n_jobs=-1 # Use all available cores\n", + ")\n", + "\n", + "# Train the model (this stays the same)\n", + "model.fit(X_train_vec, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "484d6523", + "metadata": {}, + "outputs": [], + "source": [ + "# Predict on the test set\n", + "y_pred = model.predict(X_test_vec)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5f194883", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['tfidf_vectorizer_rf.pkl']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import joblib\n", + "\n", + "# Save model\n", + "joblib.dump(model, 'random_forest_model.pkl')\n", + "\n", + "# Save TF-IDF vectorizer (this stays the same)\n", + "joblib.dump(vectorizer, 'tfidf_vectorizer_rf.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2ef43d5f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.99 0.99 3241\n", + " 1 0.99 1.00 1.00 3954\n", + "\n", + " accuracy 1.00 7195\n", + " macro avg 1.00 1.00 1.00 7195\n", + "weighted avg 1.00 1.00 1.00 7195\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Classification report and confusion matrix\n", + "print(classification_report(y_test, y_pred))\n", + "sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')\n", + "plt.title(\"Confusion Matrix\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ab90e014", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline accuracy: 0.54954829742877\n" + ] + } + ], + "source": [ + "from sklearn.dummy import DummyClassifier\n", + "dummy = DummyClassifier(strategy=\"most_frequent\")\n", + "dummy.fit(X_train_vec, y_train)\n", + "print(\"Baseline accuracy:\", dummy.score(X_test_vec, y_test))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a787659a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Probabilities: [[0.99 0.01]]\n" + ] + } + ], + "source": [ + "sample_text = [\"This is a sample news article.\"] # Replace with your sample text\n", + "sample_vec = vectorizer.transform(sample_text)\n", + "\n", + "probs = model.predict_proba(sample_vec)\n", + "print(\"Probabilities:\", probs)\n" + ] + }, + { + "cell_type": "markdown", + "id": "37fa4daf", + "metadata": {}, + "source": [ + "# Load validation_data.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e984f239", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] /Users/luis.guimaraes/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + } + ], + "source": [ + "\n", + "# Import stemmer from nltk\n", + "from nltk.stem import PorterStemmer\n", + "nltk.download('punkt') # Need this for word_tokenize\n", + "\n", + "# Initialize stemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "# Define stem_text 
function\n", + "def stem_text(text):\n", + "\tif isinstance(text, str):\n", + "\t\t# Tokenize the text\n", + "\t\ttokens = word_tokenize(text.lower())\n", + "\t\t# Apply stemming\n", + "\t\tstemmed_tokens = [stemmer.stem(token) for token in tokens]\n", + "\t\t# Join tokens back into a string\n", + "\t\treturn ' '.join(stemmed_tokens)\n", + "\treturn ''\n", + "\n", + "# Load validation data and prepare it for prediction\n", + "validation_df = pd.read_csv(\"dataset/validation_data.csv\")\n", + "\n", + "# Clean NaNs before applying\n", + "validation_df['title'] = validation_df['title'].fillna('')\n", + "validation_df['text'] = validation_df['text'].fillna('')\n", + "\n", + "# Apply stemming\n", + "validation_df['title'] = validation_df['title'].apply(stem_text)\n", + "validation_df['text'] = validation_df['text'].apply(stem_text)\n", + "\n", + "# Combine title and text\n", + "validation_df['text_clean'] = (validation_df['title'] + ' ' + validation_df['text']).str.strip()\n", + "validation_df = validation_df[validation_df['text_clean'] != '']\n", + "\n", + "# Prepare features (ignore label column as instructed)\n", + "X_val = validation_df['text_clean']\n", + "\n", + "# Transform using the same vectorizer used for training\n", + "X_val_vec = vectorizer.transform(X_val)\n", + "\n", + "# Get predictions (0 or 1)\n", + "predictions = model.predict(X_val_vec)\n", + "\n", + "# Add predictions to the validation dataframe\n", + "validation_df['predicted_label'] = predictions\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "217a1684", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First few predictions:\n", + " title predicted_label\n", + "0 uk 's may 'receiv regular updat ' on london tu... 1\n", + "1 uk transport polic lead investig of london inc... 1\n", + "2 pacif nation crack down on north korean ship a... 1\n", + "3 three suspect al qaeda milit kill in yemen dro... 
1\n", + "4 chines academ prod beij to consid north korea ... 1\n", + "\n", + "Prediction counts:\n", + "predicted_label\n", + "0 3688\n", + "1 1268\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Display the first few predictions\n", + "print(\"First few predictions:\")\n", + "print(validation_df[['title', 'predicted_label']].head())\n", + "\n", + "validation_df[['title', 'predicted_label']].to_csv('validation_predictions.csv', index=False)\n", + "\n", + "# Create a copy with index as id\n", + "#result_df = pd.DataFrame({\n", + "# 'id': validation_df.index,\n", + "# 'predicted_label': validation_df['predicted_label']\n", + "#})\n", + "\n", + "validation_df.to_csv('validation_predictions-rf.csv', index=False)\n", + "\n", + "# Count of each prediction class\n", + "print(\"\\nPrediction counts:\")\n", + "print(validation_df['predicted_label'].value_counts())\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "3.10.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/main.py b/main.py new file mode 100644 index 0000000..980e6d2 --- /dev/null +++ b/main.py @@ -0,0 +1,182 @@ +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.pipeline import Pipeline +from sklearn.dummy import DummyClassifier +import seaborn as sns +import matplotlib.pyplot as plt +import re +import string +import nltk +from nltk.tokenize import word_tokenize +from nltk.stem 
import WordNetLemmatizer
# NOTE: the line above is the wrapped tail of a `from nltk.stem import ...`
# statement; the complete import list follows.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import joblib
import os

# Download required NLTK data
nltk.download('wordnet')
nltk.download('punkt')


# Custom text preprocessor class
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans and lemmatizes article text.

    ``transform`` accepts:

    * a DataFrame that already has a ``cleaned_text`` column -- that column
      is returned untouched (lets callers cache the expensive cleaning step);
    * a DataFrame with ``title`` and ``text`` columns -- the two are joined
      with a single space and cleaned;
    * a Series (or any other iterable) of strings -- each item is cleaned.

    The transformer is stateless: ``fit`` learns nothing.
    """

    # Patterns removed from the lower-cased text, applied in this order.
    # Compiled once for the class instead of on every clean_text call.
    _STRIP_PATTERNS = (
        re.compile(r'\[.*?\]'),                                # [bracketed] spans
        re.compile(r'http\S+|www\S+|https\S+'),                # URLs
        re.compile(r'<.*?>+'),                                 # HTML-like tags
        re.compile(r'[%s]' % re.escape(string.punctuation)),   # punctuation
        # NOTE(review): newlines are deleted, not replaced by a space, so the
        # words on either side of a line break are fused. Kept as-is because
        # changing it would silently disagree with already-cached cleaned text.
        re.compile(r'\n'),
        re.compile(r'\w*\d\w*'),                               # tokens containing a digit
    )

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return ``text`` lower-cased, stripped of noise and lemmatized.

        Non-string input (NaN, None, numbers) yields an empty string instead
        of raising, mirroring how missing cells should be treated.
        """
        if not isinstance(text, str):
            return ''

        text = text.lower()
        for pattern in self._STRIP_PATTERNS:
            text = pattern.sub('', text)

        tokens = word_tokenize(text)
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        return ' '.join(lemmatized_tokens)

    def fit(self, X, y=None):
        """No-op; present only to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return a Series of cleaned documents for ``X`` (see class docstring)."""
        if isinstance(X, pd.DataFrame):
            # Check if 'cleaned_text' column exists, if so, return it directly
            if 'cleaned_text' in X.columns:
                return X['cleaned_text']

            # Combine title and text. Missing values are blanked first:
            # NaN + " " + str is NaN, which would otherwise reach clean_text.
            title = X['title'].fillna('').astype(str)
            body = X['text'].fillna('').astype(str)
            return (title + " " + body).apply(self.clean_text)

        if not isinstance(X, pd.Series):
            # Plain lists / arrays have no .apply; wrap them.
            X = pd.Series(list(X), dtype=object)
        return X.apply(self.clean_text)


# Define cache file paths
TRAIN_CACHE_PATH = 'dataset/train_cleaned.parquet'
TEST_CACHE_PATH = 'dataset/test_cleaned.parquet'

# Load and prepare base data
df = pd.read_csv("dataset/data.csv")

# Data cleaning
# Drop missing values *before* the emptiness filter: `NaN != ''` is True, so
# a row with a missing title would otherwise survive and later turn the
# combined "title + text" string into NaN.
df = df.dropna(subset=['title', 'text', 'label'])
df = df[
    (df['title'] != '') &
    (df['text'].str.strip() != '')
].drop_duplicates(subset=['text'])

# Split data FIRST to prevent leakage
train_df, test_df = train_test_split(
    df[['title', 'text', 'label']],  # Keep raw text for pipeline processing
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
# Explicit copies so that adding 'cleaned_text' below never writes to a view.
train_df = train_df.copy()
test_df = test_df.copy()

print("\nTraining set class distribution:")
print(train_df['label'].value_counts())
print("\nTest set class distribution:")
print(test_df['label'].value_counts())


def _load_cached_split(cache_path, fresh_split):
    """Return the cached cleaned frame for ``fresh_split``, or None if unusable.

    The cache is accepted only when it exists, carries a 'cleaned_text'
    column and holds exactly the same raw texts, in the same order, as the
    split that was just computed. Anything else means the source data or
    the split changed since the cache was written.
    """
    if not os.path.exists(cache_path):
        return None
    cached = pd.read_parquet(cache_path)
    if 'cleaned_text' not in cached.columns or len(cached) != len(fresh_split):
        return None
    # The cache is written with index=False, so compare position-wise.
    if cached['text'].tolist() != fresh_split['text'].tolist():
        return None
    return cached


# Check for cached cleaned data
cached_train = _load_cached_split(TRAIN_CACHE_PATH, train_df)
cached_test = _load_cached_split(TEST_CACHE_PATH, test_df)

if cached_train is not None and cached_test is not None:
    print("Loading cleaned data from cache...")
    train_df = cached_train
    test_df = cached_test
    print("Cleaned data loaded.")
else:
    print("Cleaning data and saving to cache...")
    # Apply cleaning and add 'cleaned_text' column
    preprocessor = TextPreprocessor()
    train_df['cleaned_text'] = preprocessor.transform(train_df)
    test_df['cleaned_text'] = preprocessor.transform(test_df)

    # Save cleaned data to parquet
    train_df.to_parquet(TRAIN_CACHE_PATH, index=False)
    test_df.to_parquet(TEST_CACHE_PATH, index=False)
    print("Cleaned data saved to cache.")


# Create preprocessing pipeline
# The TextPreprocessor will now use the 'cleaned_text' column if it exists
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(
        max_features=8000,
        stop_words='english',
        min_df=5,
        max_df=0.8,
    )),
    ('classifier', LogisticRegression(
        class_weight='balanced',
        solver='saga',
        penalty='l1',
        C=0.5,
        max_iter=1000,
        random_state=42,
    )),
])

# Train model
# The pipeline will now use the 'cleaned_text' column from the loaded dataframes
pipeline.fit(train_df, train_df['label'])

# Evaluate
y_pred = pipeline.predict(test_df)
print("Evaluation on Test Set:")
print(classification_report(test_df['label'], y_pred))
# Fresh figure per matrix: on non-interactive backends plt.show() does not
# clear the current figure, so the second heatmap would be drawn over the first.
plt.figure()
sns.heatmap(confusion_matrix(test_df['label'], y_pred), annot=True, fmt='d')
plt.title("Confusion Matrix (Test Set)")
plt.show()

# Evaluate on Training Set
print("\nEvaluation on Training Set:")
y_train_pred = pipeline.predict(train_df)
print(classification_report(train_df['label'], y_train_pred))
plt.figure()
sns.heatmap(confusion_matrix(train_df['label'], y_train_pred), annot=True, fmt='d')
plt.title("Confusion Matrix (Training Set)")
plt.show()

# Save entire pipeline
joblib.dump(pipeline, 'text_classification_pipeline.pkl')

# Baseline evaluation
dummy = DummyClassifier(strategy="most_frequent")
# Use the original text columns for baseline evaluation
dummy.fit(train_df[['title', 'text']], train_df['label'])
print("Baseline accuracy:", dummy.score(test_df[['title', 'text']], test_df['label']))

# Validation processing (using same lemmatization)
validation_df = pd.read_csv("dataset/validation_data.csv").fillna({'title': '', 'text': ''})
# The TextPreprocessor will clean the validation data as it's not cached
validation_df['cleaned_text'] = TextPreprocessor().transform(validation_df)
print("Validation data processed.")

print("Generating predictions for validation data...")
# Predict using the cleaned text column
predictions = pipeline.predict(validation_df)
validation_df['predicted_label'] = predictions
print("Predictions generated.")

# Save results
print("Saving validation predictions...")
validation_df[['title', 'predicted_label']].to_csv('validation_predictions.csv', index=False)
print("Validation predictions saved.")
print("\nPrediction counts:")
print(validation_df['predicted_label'].value_counts())