diff --git a/examples.ipynb b/examples.ipynb new file mode 100644 index 00000000..c53799b4 --- /dev/null +++ b/examples.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ac622319", + "metadata": {}, + "source": [ + "# Descriptive statistics examples" + ] + }, + { + "cell_type": "markdown", + "id": "aa8993e4", + "metadata": {}, + "source": [ + "### Example 1" + ] + }, + { + "cell_type": "markdown", + "id": "5e0ab0d5", + "metadata": {}, + "source": [ + "Show with a coded example that the mean is the estimator that minimizes the squared error, and give the intuition behind it.\n", + "\n", + "Note (here MSE denotes the sum of squared errors; dividing by $n$ does not change the minimizer):\n", + "\n", + "$$\n", + "MSE=\\sum_{i=1}^{n}(X_{i}-\\mu)^2\n", + "$$\n", + "\n", + "Consider the following array `X=[3,5,6,3,1,5,7,9,5,4]`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "34720ab6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Estimated mu: 4.799999977760703\n", + "Minimum MSE: 45.6\n", + "Populational mu: 4.8\n" + ] + } + ], + "source": [ + "# TODO\n" + ] + }, + { + "cell_type": "markdown", + "id": "46c70c3d", + "metadata": {}, + "source": [ + "### Example 2\n", + "\n", + "Consider this data:\n", + "````\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Set a seed for reproducibility\n", + "np.random.seed(0)\n", + "\n", + "# Generate right-skewed data using the gamma distribution\n", + "shape = 2 # Shape parameter (controls skewness, adjust as needed)\n", + "scale = 1 # Scale parameter (controls spread, adjust as needed)\n", + "size = 1000 # Number of data points\n", + "\n", + "# Generate right-skewed data\n", + "data = np.random.gamma(shape, scale, size)\n", + "\n", + "# Create a histogram\n", + "plt.hist(data, bins=30, density=True, alpha=0.6, color='b', label='Right-Skewed Data')\n", + "\n", + "# Add labels and a legend\n", + "plt.xlabel('Value')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Right-Skewed Distribution')\n", +
"plt.legend(loc='upper right')\n", + "\n", + "# Show the plot\n", + "plt.show()\n", + "````\n", + "\n", + "1. Run the code.\n", + "2. Show that the distribution of the sample mean approximately follows a normal distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d590308e", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO" + ] + }, + { + "cell_type": "markdown", + "id": "00a7f40c", + "metadata": {}, + "source": [ + "### Example 3\n", + "\n", + "Consider this data:\n", + "````\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Set a seed for reproducibility\n", + "np.random.seed(0)\n", + "\n", + "# Parameters for the uniform distribution\n", + "low = 0 # Lower bound\n", + "high = 10 # Upper bound\n", + "size = 1000 # Number of data points\n", + "\n", + "# Generate random data from a uniform distribution\n", + "data = np.random.uniform(low, high, size)\n", + "\n", + "# Create a histogram\n", + "plt.hist(data, bins=30, density=True, alpha=0.6, color='b', label='Uniform Data')\n", + "\n", + "# Add labels and a legend\n", + "plt.xlabel('Value')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Uniform Distribution')\n", + "plt.legend(loc='upper right')\n", + "\n", + "# Show the plot\n", + "plt.show()\n", + "````\n", + "\n", + "1. Run the code.\n", + "2. Show that the distribution of the sample mean approximately follows a normal distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "12f8030c", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO\n" + ] + }, + { + "cell_type": "markdown", + "id": "2767a3c8", + "metadata": {}, + "source": [ + "### Example 4\n", + "\n", + "1. Simulate a normal distribution\n", + "2. Plot the histogram and add a vertical line for the mean and median.\n", + "3. 
Begin to add outliers and see how the mean, median and mode are affected.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "214ecc0d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "
"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# TODO\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ea06451",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0e55e5c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "9248718ffe6ce6938b217e69dbcc175ea21f4c6b28a317e96c05334edae734bb"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('ML-BOOTCAMP')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ryan/problems.ipynb b/ryan/problems.ipynb
new file mode 100644
index 00000000..7821c51c
--- /dev/null
+++ b/ryan/problems.ipynb
@@ -0,0 +1,162 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ac622319",
+   "metadata": {},
+   "source": [
+    "# Descriptive statistics problems"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa8993e4",
+   "metadata": {},
+   "source": [
+    "### Exercise 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e0ab0d5",
+   "metadata": {},
+   "source": [
+    "We will use NumPy to compute descriptive statistics for two random samples.\n",
+    "\n",
+    "- Generate an array of 100 elements following a normal distribution.\n",
+    "- Generate an array of 100 elements following a chi-square distribution with 3 degrees of freedom.\n",
+    "- Calculate the main metrics and statistical measures that best describe the two vectors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34720ab6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import statistics as stats\n",
+    "\n",
+    "import numpy as np\n",
+    "from scipy.stats import kurtosis, skew\n",
+    "\n",
+    "SEED = 42  # fixed seed so every run draws the same samples\n",
+    "SAMPLE_SIZE = 100  # number of elements in each sample\n",
+    "CHI2_DOF = 3  # degrees of freedom of the chi-square sample\n",
+    "PREVIEW = 5  # number of raw values shown per sample\n",
+    "\n",
+    "\n",
+    "def estimate_mode(sample, bins='auto'):\n",
+    "    \"\"\"Estimate the mode of a continuous sample from its histogram.\n",
+    "\n",
+    "    Values drawn from a continuous distribution are almost surely all distinct, so\n",
+    "    `statistics.mode` merely returns the first element (or raises on\n",
+    "    Python < 3.8). The midpoint of the fullest histogram bin is used instead.\n",
+    "    \"\"\"\n",
+    "    counts, edges = np.histogram(sample, bins=bins)\n",
+    "    peak = int(np.argmax(counts))\n",
+    "    return (edges[peak] + edges[peak + 1]) / 2\n",
+    "\n",
+    "\n",
+    "def describe(sample):\n",
+    "    \"\"\"Return the main descriptive statistics of a 1-D numeric sample.\n",
+    "\n",
+    "    Variance and standard deviation use ddof=1 (sample estimators);\n",
+    "    kurtosis is SciPy's default excess kurtosis (0 for a normal distribution).\n",
+    "    \"\"\"\n",
+    "    values = np.asarray(sample, dtype=float)\n",
+    "    return {\n",
+    "        'mean': values.mean(),\n",
+    "        'median': np.median(values),\n",
+    "        'mode (histogram estimate)': estimate_mode(values),\n",
+    "        'range': values.max() - values.min(),\n",
+    "        'var': values.var(ddof=1),\n",
+    "        'stdev': values.std(ddof=1),\n",
+    "        'skew': skew(values),\n",
+    "        'kurtosis': kurtosis(values),\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "np.random.seed(SEED)\n",
+    "elements_1 = np.random.normal(size=SAMPLE_SIZE)\n",
+    "elements_2 = np.random.chisquare(CHI2_DOF, SAMPLE_SIZE)\n",
+    "\n",
+    "# Same report for both samples; show only a preview of the raw values\n",
+    "for label, sample in (('normal', elements_1), ('chi', elements_2)):\n",
+    "    for metric, value in describe(sample).items():\n",
+    "        print(f'{label} {metric}: {value:.4f}')\n",
+    "    print(f'{label} first {PREVIEW} values: {sample[:PREVIEW]}\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "46c70c3d",
+   "metadata": {},
+   "source": [
+    "### Exercise 2\n",
+    "\n",
+    "Write a Python program to calculate the standard deviation of the following data:\n",
+    "\n",
+    "```py\n",
+    "data = [4, 2, 5, 8, 6]\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d590308e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The standard deviation of the list is 2.24\n"
+     ]
+    }
+   ],
+   "source": [
+    "data = [4, 2, 5, 8, 6]\n",
+    "\n",
+    "# Calculate the mean\n",
+    "mean = sum(data) / len(data)\n",
+    "\n",
+    "# Calculate the sample variance (n - 1 in the denominator: Bessel's correction)\n",
+    "variance = sum((x - mean) ** 2 for x in data) / (len(data) - 1)\n",
+    "\n",
+    "# Calculate the standard deviation\n",
+    "std_dev = variance ** 0.5\n",
+    "\n",
+    "# Cross-check the manual computation against the standard library\n",
+    "assert abs(std_dev - stats.stdev(data)) < 1e-12\n",
+    "\n",
+    "print(f\"The standard deviation of the list is {round(std_dev,2)}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "9248718ffe6ce6938b217e69dbcc175ea21f4c6b28a317e96c05334edae734bb"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('ML-BOOTCAMP')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}