diff --git a/python/examples/SVMRecipeClassifier/README.md b/python/examples/SVMRecipeClassifier/README.md new file mode 100644 index 00000000..f5b9e560 --- /dev/null +++ b/python/examples/SVMRecipeClassifier/README.md @@ -0,0 +1,43 @@ +# Binary Recipe Classifier using LIBSVM + +This project implements a binary recipe classifier using LIBSVM. It classifies recipes as either Italian or Mexican cuisine based on their ingredients. It serves as a demonstration of applying Support Vector Machines (SVM) to text classification tasks, specifically in the domain of recipe categorization. + +## Key Features + +- Binary classification of recipes (Italian vs Mexican) +- Utilizes LIBSVM for efficient SVM implementation +- Preprocesses text-based recipe data into numerical features +- Includes a comprehensive test suite for validation + +## Requirements + +- Python 3.7+ +- NumPy +- SciPy +- pandas +- LIBSVM + + +## Usage + +To use the RecipeClassifier in your Python script: + +```python +from recipe_classifier import RecipeClassifier + +# Initialize the classifier +classifier = RecipeClassifier() + +# Train the classifier +recipes = [ + "pasta tomato basil olive_oil garlic", + "tortilla beans salsa avocado cilantro", + "pizza cheese tomato oregano", + "tacos beef lettuce cheese salsa" +] +cuisines = ["Italian", "Mexican", "Italian", "Mexican"] +classifier.train(recipes, cuisines) + +# Make predictions +new_recipes = ["lasagna pasta cheese tomato_sauce beef", "burrito rice beans salsa guacamole"] +predictions = classifier.predict(new_recipes) +print(predictions) +``` \ No newline at end of file diff --git a/python/examples/SVMRecipeClassifier/recipe_classifier.py b/python/examples/SVMRecipeClassifier/recipe_classifier.py new file mode 100644 index 00000000..31797dde --- /dev/null +++ b/python/examples/SVMRecipeClassifier/recipe_classifier.py @@ -0,0 +1,152 @@ +""" +Recipe Classifier using LIBSVM + +This module implements a binary classifier for Italian and Mexican recipes +using LIBSVM. 
class RecipeClassifier:
    """A binary classifier for Italian and Mexican recipes using LIBSVM.

    Each recipe is a whitespace-separated string of ingredient tokens. Tokens
    are mapped to columns of a sparse feature matrix through a vocabulary
    that is learned from the training recipes.

    Attributes:
        model: The trained LIBSVM model, or None until train() has run.
        vocabulary (dict): Maps ingredient token -> feature column index, or
            None until it has been built.
    """

    def __init__(self):
        """Initialize the RecipeClassifier with no model and no vocabulary."""
        self.model = None
        self.vocabulary = None

    @staticmethod
    def _split_indices(n_samples, train_fraction=0.8, seed=42):
        """Return (train_idx, val_idx) for a seeded shuffle split.

        A private RandomState is used instead of np.random.seed() so the
        caller's global NumPy random state is left untouched; for the same
        seed it yields the same permutation as the seeded global generator.
        """
        order = np.random.RandomState(seed).permutation(n_samples)
        cut = int(train_fraction * n_samples)
        return order[:cut], order[cut:]

    def preprocess_data(self, recipes, cuisines):
        """
        Preprocess the recipe data for LIBSVM.

        If no vocabulary exists yet, one is built from ``recipes``. Tokens
        missing from the vocabulary are ignored. Any label other than
        'Italian' (including None) is encoded as -1.

        Args:
            recipes (list): List of recipe ingredient strings.
            cuisines (list): List of cuisine labels ('Italian' or 'Mexican').

        Returns:
            tuple: (X, y) where X is a sparse matrix of features and y is an array of labels.
        """
        if self.vocabulary is None:
            # Sort the tokens: iterating a set of strings is not ordered
            # consistently across interpreter runs, which would make the
            # feature columns (and thus saved models) non-reproducible.
            tokens = sorted({token for recipe in recipes for token in recipe.split()})
            self.vocabulary = {token: col for col, token in enumerate(tokens)}

        # Build the matrix in coordinate form: one entry per known token.
        rows, cols, data = [], [], []
        for row, recipe in enumerate(recipes):
            for token in recipe.split():
                col = self.vocabulary.get(token)
                if col is not None:
                    rows.append(row)
                    cols.append(col)
                    data.append(1)

        X = csr_matrix((data, (rows, cols)), shape=(len(recipes), len(self.vocabulary)))
        y = np.array([1 if cuisine == 'Italian' else -1 for cuisine in cuisines])
        return X, y

    def train(self, recipes, cuisines):
        """
        Train the SVM model.

        The data is shuffled with a fixed seed and split 80/20 into training
        and validation parts; the validation accuracy is printed.

        Args:
            recipes (list): List of recipe ingredient strings.
            cuisines (list): List of cuisine labels ('Italian' or 'Mexican').

        Raises:
            ValueError: If recipes and cuisines differ in length, or if fewer
                than two recipes are given (the split would leave the
                training or validation part empty).
        """
        if len(recipes) != len(cuisines):
            raise ValueError(
                f"recipes and cuisines must have the same length "
                f"(got {len(recipes)} and {len(cuisines)})."
            )
        if len(recipes) < 2:
            raise ValueError("At least two recipes are required to train and validate.")
        if len(recipes) < 20:
            warnings.warn("The dataset is very small. The model is likely to overfit.")

        # Re-learn the vocabulary on every training run; otherwise a second
        # train() call would silently drop ingredients unseen the first time.
        self.vocabulary = None
        X, y = self.preprocess_data(recipes, cuisines)

        # Split data into training and validation sets
        train_idx, val_idx = self._split_indices(len(recipes))
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        # Convert to LIBSVM format (dense Python lists)
        prob = svm_problem(y_train.tolist(), X_train.toarray().tolist())
        param = svm_parameter('-t 0 -c 0.1')  # Linear kernel, C=0.1 for less overfitting
        self.model = svm_train(prob, param)

        # Validate the model
        p_labels, _, _ = svm_predict(y_val.tolist(), X_val.toarray().tolist(), self.model)
        accuracy = sum(1 for i, j in zip(p_labels, y_val) if i == j) / len(y_val)
        print(f"Validation Accuracy: {accuracy:.2f}")

        if accuracy == 1.0:
            warnings.warn("Perfect validation accuracy suggests overfitting.")

    def predict(self, new_recipes):
        """
        Predict cuisines for new recipes.

        Args:
            new_recipes (list): List of new recipe ingredient strings.

        Returns:
            list: Predicted cuisines ('Italian' or 'Mexican').

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Labels are unknown at prediction time; placeholders keep the
        # preprocess_data() signature satisfied and are discarded.
        X, _ = self.preprocess_data(new_recipes, [None] * len(new_recipes))
        p_labels, _, _ = svm_predict([0] * X.shape[0], X.toarray().tolist(), self.model)
        return ['Italian' if label > 0 else 'Mexican' for label in p_labels]


def main():
    """Demonstrate the usage of RecipeClassifier."""
    classifier = RecipeClassifier()

    # Sample data
    recipes = [
        "pasta tomato basil olive_oil garlic",
        "tortilla beans salsa avocado cilantro",
        "spaghetti meatballs tomato_sauce parmesan",
        "tacos beef lettuce cheese salsa",
        "pizza mozzarella tomato basil oregano",
        "enchiladas chicken cheese salsa corn",
        "lasagna pasta beef tomato cheese",
        "quesadilla tortilla cheese beans salsa",
        "risotto rice parmesan white_wine",
        "guacamole avocado lime cilantro onion",
    ]
    cuisines = ["Italian", "Mexican", "Italian", "Mexican", "Italian",
                "Mexican", "Italian", "Mexican", "Italian", "Mexican"]

    # Train the model
    classifier.train(recipes, cuisines)

    # Predict new recipes
    new_recipes = [
        "pizza cheese tomato basil oregano",
        "burrito rice beans salsa guacamole",
    ]
    predictions = classifier.predict(new_recipes)
    print("Predictions for new recipes:", predictions)

    # Evaluate on training data
    train_predictions = classifier.predict(recipes)
    accuracy = sum(1 for pred, true in zip(train_predictions, cuisines) if pred == true) / len(cuisines)
    print(f"Training Accuracy: {accuracy:.2f}")

    print("\nWARNING: This model is overfitting due to the small dataset.")
    print("For a real-world application, consider the following improvements:")
    print("1. Collect a much larger and more diverse dataset.")
    print("2. Use cross-validation for more robust evaluation.")
    print("3. Implement feature engineering specific to recipe classification.")
    print("4. Experiment with different ML algorithms and hyperparameters.")
class TestRecipeClassifier(unittest.TestCase):
    """
    A test suite for the RecipeClassifier class.

    This class contains various test methods to ensure the correct functionality
    of the RecipeClassifier, including data preprocessing, model training, and prediction.
    """

    def setUp(self):
        """
        Set up the test environment before each test method.

        This method initializes a RecipeClassifier instance and defines sample
        recipes and cuisines for testing purposes.
        """
        print("\n--- Setting up test environment ---")
        self.classifier = RecipeClassifier()
        self.recipes = [
            "pasta tomato basil olive_oil garlic",
            "tortilla beans salsa avocado cilantro",
            "spaghetti meatballs tomato_sauce parmesan",
            "tacos beef lettuce cheese salsa",
            "pizza mozzarella tomato basil oregano",
            "enchiladas chicken cheese salsa corn",
            "risotto rice parmesan white_wine mushroom",
            "guacamole avocado lime cilantro onion",
        ]
        self.cuisines = ["Italian", "Mexican", "Italian", "Mexican",
                         "Italian", "Mexican", "Italian", "Mexican"]
        print(f"Initialized classifier with {len(self.recipes)} sample recipes")

    def test_init(self):
        """
        Test the initialization of the RecipeClassifier.

        This test ensures that a new RecipeClassifier instance has its model
        and vocabulary attributes properly initialized to None.
        """
        print("\n--- Testing initialization ---")
        print(f"Model: {self.classifier.model}")
        print(f"Vocabulary: {self.classifier.vocabulary}")
        self.assertIsNone(self.classifier.model, "Model should be None upon initialization")
        self.assertIsNone(self.classifier.vocabulary, "Vocabulary should be None upon initialization")
        print("Initialization test passed successfully")

    def test_preprocess_data(self):
        """
        Test the data preprocessing method of RecipeClassifier.

        This test checks if the preprocess_data method correctly converts
        the input recipes and cuisines into feature matrices and labels.
        """
        X, y = self.classifier.preprocess_data(self.recipes, self.cuisines)
        print(f"Preprocessed feature matrix shape: {X.shape}")
        print(f"Label array shape: {y.shape}")
        print(f"Unique labels: {np.unique(y)}")
        # Check if X is a sparse matrix with correct dimensions
        self.assertEqual(X.shape[0], len(self.recipes))
        self.assertGreater(X.shape[1], 0)

        # Check if y is a numpy array with correct length and values
        self.assertIsInstance(y, np.ndarray)
        self.assertEqual(len(y), len(self.cuisines))
        self.assertTrue(all(label in [1, -1] for label in y))
        print("Data preprocessing test passed successfully")

    def test_train(self):
        """
        Test the training method of RecipeClassifier.

        This test checks that train() sets the model attribute and that it
        warns about the small dataset (the fixture has fewer than 20 recipes).
        """
        print("\n--- Testing model training ---")
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            self.classifier.train(self.recipes, self.cuisines)
        messages = [str(warning.message) for warning in caught]
        # Assert the warning instead of merely reporting it, so a regression
        # in the small-dataset check actually fails the test.
        self.assertTrue(
            any("dataset is very small" in message for message in messages),
            "train() should warn when the dataset is very small",
        )
        print("Warning: Dataset is very small, as expected")
        print(f"Model after training: {self.classifier.model}")
        self.assertIsNotNone(self.classifier.model, "Model should not be None after training")
        print("Training test passed successfully")

    def test_predict(self):
        """
        Test the prediction method of RecipeClassifier.

        This test checks if the predict method returns the expected output
        for new recipes after training the model.
        """
        print("\n--- Testing prediction ---")
        self.classifier.train(self.recipes, self.cuisines)

        new_recipes = [
            "pizza cheese tomato basil",
            "burrito rice beans salsa",
        ]
        print("Predicting cuisines for new recipes:")
        for recipe in new_recipes:
            print(f"  - {recipe}")
        predictions = self.classifier.predict(new_recipes)
        print("Predictions:", predictions)
        # Check if predictions are returned for all new recipes
        self.assertEqual(len(predictions), len(new_recipes),
                         "Number of predictions should match number of new recipes")

        # Check if all predictions are either 'Italian' or 'Mexican'
        self.assertTrue(all(cuisine in ['Italian', 'Mexican'] for cuisine in predictions),
                        "All predictions should be either Italian or Mexican")
        print("Prediction test passed successfully")

    def test_predict_without_training(self):
        """
        Test prediction without prior training.

        This test ensures that attempting to make predictions without first
        training the model raises a ValueError.
        """
        print("\n--- Testing prediction without training ---")
        with self.assertRaises(ValueError):
            self.classifier.predict(["pizza cheese tomato basil"])
        print("Raised exception as expected!")
        print("Prediction without training test passed successfully")

    def test_train_and_predict_accuracy(self):
        """
        Test accuracy on the training recipes.

        After training, predicting the same recipes should be correct for
        more than 75% of them.
        """
        print("\n--- Testing training and prediction accuracy ---")
        self.classifier.train(self.recipes, self.cuisines)
        predictions = self.classifier.predict(self.recipes)
        accuracy = sum(p == c for p, c in zip(predictions, self.cuisines)) / len(self.cuisines)
        print(f"Training accuracy: {accuracy:.2%}")
        self.assertGreater(accuracy, 0.75, "Training accuracy should be above 75%")
        print("Training and prediction accuracy test passed successfully")

    def test_vocabulary_creation(self):
        """
        Test that training builds a vocabulary.

        Ingredients from the first two fixture recipes must all appear as
        keys of the classifier's vocabulary after training.
        """
        print("\n--- Testing vocabulary creation ---")
        self.classifier.train(self.recipes, self.cuisines)
        self.assertIsNotNone(self.classifier.vocabulary, "Vocabulary should not be None after training")
        print(f"Vocabulary size: {len(self.classifier.vocabulary)}")
        expected_ingredients = ["pasta", "tomato", "basil", "olive_oil", "garlic", "tortilla", "beans", "salsa",
                                "avocado", "cilantro"]
        for ingredient in expected_ingredients:
            self.assertIn(ingredient, self.classifier.vocabulary, f"{ingredient} should be in the vocabulary")
            print(f"'{ingredient}' found in vocabulary")
        print("Vocabulary creation test passed successfully")

    def test_predict_new_recipes(self):
        """
        Test predicted cuisines for two unseen recipes.

        NOTE(review): the exact-label assertions depend on what the model
        learns from a very small training split; confirm they are stable
        before relying on this test as a regression gate.
        """
        # Train the classifier
        print("\nTraining the classifier...")
        self.classifier.train(self.recipes, self.cuisines)
        print(f"Vocabulary size after training: {len(self.classifier.vocabulary)}")

        # New recipes to test
        new_recipes = [
            "lasagna pasta cheese tomato_sauce beef",
            "burrito rice beans salsa guacamole",
        ]

        print("\nPredicting new recipes:")
        for recipe in new_recipes:
            print(f"Recipe: {recipe}")

        # Predict new recipes
        predictions = self.classifier.predict(new_recipes)

        print("\nPrediction results:")
        for recipe, prediction in zip(new_recipes, predictions):
            print(f"Recipe: {recipe}")
            print(f"Predicted cuisine: {prediction}")

        # Check predictions
        self.assertEqual(predictions[0], "Italian", "Lasagna should be classified as Italian")
        self.assertEqual(predictions[1], "Mexican", "Burrito should be classified as Mexican")

        print("\nTest passed successfully!")


if __name__ == '__main__':
    unittest.main()