Stats exercises with Python libs and AI

Besides pandas and numpy, there are other Python libraries for statistics, such as scipy, sklearn, matplotlib for data visualization, and statsmodels, which provides classes and functions for estimating many different statistical models, conducting statistical tests, and exploring statistical data. Let’s see how to use these options by practicing with AI.

Start by creating lists from data generated from a CSV file (sample file available to download here: https://heart4datascience.com/2020/12/20/pandas/)

#stats lib
import numpy as np                    # pip install numpy
import pandas as pd                   # pip install pandas
from scipy import stats               # pip install scipy
import matplotlib.pyplot as plt       # pip install matplotlib
import seaborn as sns                 # pip install seaborn
import statsmodels.api as sm          # pip install statsmodels

# Load the sample data set that the exercises work on.
df = pd.read_csv('MW.csv')

# Pull the two columns of interest out of the frame as pandas Series ...
IQ_Column, Age_Column = df['IQ'], df['Age']

# ... and convert each Series to a plain Python list.
IQ_List, Age_List = IQ_Column.tolist(), Age_Column.tolist()


# Show the values first, then the number of entries in each list.
print(IQ_List, Age_List, sep='\n')
print(len(IQ_List), len(Age_List), sep='\n')

And now let’s compare the results of basic statistical operations, such as the average or median, as computed by each library's methods.

# AVERAGE MEASURES

# numpy: mean, median and standard deviation of the IQ values
np_data = IQ_List
IQ_mean, IQ_median = np.mean(np_data), np.median(np_data)
# np.std uses ddof=0 by default, i.e. the population standard deviation.
IQ_standard_dev = np.std(np_data)

print(IQ_mean, IQ_median, IQ_standard_dev)
# result: 112.35 113.0 23.31903728716089

# pandas: the same measures through Series methods
pd_data = pd.Series(IQ_List)
IQ_mean, IQ_median = pd_data.mean(), pd_data.median()
# Series.std uses ddof=1 by default (sample standard deviation), which is
# why this value is slightly larger than numpy's default np.std result.
IQ_standard_dev = pd_data.std()

print(IQ_mean, IQ_median, IQ_standard_dev)
# result: 112.35 113.0 23.616107063199742

# scipy: descriptive statistics on the first six IQ values
scp_data = IQ_List[:6]
# tmean without limits is the plain arithmetic mean.
scp_mean = stats.tmean(scp_data)
# The median is the 50th percentile. Asking for percentile 20 (as this
# snippet used to) yields the 20th percentile (123.0 for this sample),
# which is not the median.
scp_median = stats.scoreatpercentile(scp_data, 50)
scp_mode = stats.mode(scp_data)
# tstd without limits is the sample standard deviation (ddof=1).
scp_dev = stats.tstd(scp_data)

print(scp_data)             # [132, 150, 123, 129, 132, 90]
print(scp_mean)             # 126.0
print(scp_median)           # 130.5
print(scp_mode)             # ModeResult(mode=132, count=2)
print(scp_dev)              # 19.809088823063014

# statsmodels
# NOTE: this six-element slice is not used by the calls that follow; they
# work on the full Age_List (the printed results match the full list).
scp_data = Age_List[:6]
# DescrStatsW is statsmodels' own descriptive-statistics class; without
# weights its .mean is the ordinary arithmetic mean.
sm_tmean = sm.stats.DescrStatsW(Age_List).mean
# 47.85
# The geometric mean comes from scipy.stats directly. The former
# `sm.tsa.stattools.stats` path was just scipy.stats reached through an
# import inside a statsmodels module, which is not a statsmodels API and
# can disappear without notice.
sm_gmean = stats.gmean(Age_List)
# 40.90597080827608
print(sm_tmean)
print(sm_gmean)

# seaborn and matplotlib: draw a histogram of the ages, then display it

sns.histplot(data=Age_List)
plt.show()

Numpy and Pandas with AI

Before we move on to deeper Python and machine learning, let’s go back for a moment to what we used pandas and numpy for. Let AI refresh our memory.

Hello AI! Let's go through the pandas and numpy libraries in Python today. Can you show me some useful examples?

Installation

Import into current File:

import pandas as pd
import numpy as np

NumPy

Basics

import numpy as np

# 1-D array built from a flat list
a = np.array([1, 2, 3, 4])
print(a)
# [1 2 3 4]

# 2-D array built from a list of rows
b = np.array([[1, 2],
              [3, 4]])
print(b)
# [[1 2]
#  [3 4]]

# 2x2 array filled with 0.0
zeros = np.zeros(shape=(2, 2))
print(zeros)
# [[0. 0.]
#  [0. 0.]]

# 3x2 array filled with 1.0
ones = np.ones(shape=(3, 2))
print(ones)
# [[1. 1.]
#  [1. 1.]
#  [1. 1.]]

# np.arange(start, stop, step): evenly spaced integers, stop is excluded
arange = np.arange(5)
print(arange)
# 1: [0 1 2 3 4]

arange = np.arange(0, 10, 2)
print(arange)
# 2: [0 2 4 6 8]

arange = np.arange(1, 15, 3)
print(arange)
# 3: [ 1  4  7 10 13]

# np.linspace(start, stop, num): num evenly spaced values, both ends included
linspace = np.linspace(0, 1, num=6)
print(linspace)
# [0.  0.2 0.4 0.6 0.8 1. ]

linspace = np.linspace(0, 10, num=6)
print(linspace)
# [ 0.  2.  4.  6.  8. 10.]

# Ops on Arrays

# element-wise arithmetic on two equal-length vectors
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
print(np.add(x, y))
# [5 7 9]
print(np.multiply(x, y))
# [ 4 10 18]

# matrix product of two 2x2 arrays
a = np.array([[1, 2],
              [3, 4]])
b = np.array([[11, 12],
              [13, 14]])
print(a @ b)
# [[37  40]
#  [85  92]]
# [[1*11+2*13, 1*12+2*14],[3*11+4*13, 3*12+4*14]]

# element-wise square root
sqrt_array = np.sqrt([1, 4, 9, 16])
print(sqrt_array)
# [1. 2. 3. 4.]

# element-wise sine; sin(pi) is only approximately zero in floating point
angles = np.array([0, np.pi/2, np.pi])
sin_array = np.sin(angles)
print(sin_array)
# [0.0000000e+00 1.0000000e+00 1.2246468e-16]

# element-wise natural logarithm
log_array = np.log([1, np.e, np.e**2])
print(log_array)
# [0. 1. 2.]
# indexing: positions are zero-based, so index 1 is the second element
a = np.arange(10, 60, 10)
print(a[1])  # 20

# slicing: rows from index 1 onward (second and third row),
# restricted to the first two columns
b = np.arange(1, 10).reshape(3, 3)
print(b[1:, :2])
# [[4 5]
#  [7 8]]
# stats: mean, median and (population) standard deviation of a small sample
data = np.array([1, 2, 3, 4, 5])
print(data.mean())      # 3.0
print(np.median(data))  # 3.0
print(data.std())       # 1.4142135623730951

# reshape: lay the twelve values 1..12 out as 3 rows of 4 columns
a = np.arange(1, 13)
b = a.reshape(3, 4)
print(b)

# flat: copy the 2-D array back into a single dimension
flat = b.flatten(order='C')
print(flat)

Pandas

# DataFrames
# Column-oriented input: each key becomes a column, each list its values.
data = dict(
    age=[5, 12, 4, 18, 10],
    color=["brown", "cappucino", "white", "tri-color", "black"],
    weight=[9, 112, 156, 6.5, 1.7],
)
# The index labels name the rows, one label per value in each column.
animal_names = ["dog", "deer", "cow", "cat", "raven"]
animals = pd.DataFrame(data, index=animal_names)
print(animals)
#        age      color  weight
# dog      5      brown     9.0
# deer    12  cappucino   112.0
# cow      4      white   156.0
# cat     18  tri-color     6.5
# raven   10      black     1.7
# DataFrames from dicts
# Row-oriented input: every dict is one row, its keys are the column names.
data = [
    dict(Name='Alice', Age=25, City='New York'),
    dict(Name='Bob', Age=30, City='San Francisco'),
    dict(Name='Charlie', Age=35, City='Los Angeles'),
]
df = pd.DataFrame(data=data)
print(df)
#       Name  Age           City
# 0    Alice   25       New York
# 1      Bob   30  San Francisco
# 2  Charlie   35    Los Angeles
# Series: a single labelled column of values with an optional name
s = pd.Series(data=[1, 2, 3, 4, 5], name="Numbers")
print(s)
# Load the sample data set and look at its first four rows.
df = pd.read_csv('MW.csv')
print(df[:4])
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line to the report.
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 40 entries, 0 to 39
# Data columns (total 8 columns):
#  #   Column  Non-Null Count  Dtype 
# ---  ------  --------------  ----- 
#  0   1       40 non-null     object
#  1   Gender  40 non-null     object
#  2   Age     40 non-null     int64 
#  3   IQ      40 non-null     int64 
#  4   PIQ     40 non-null     int64 
#  5   Weight  40 non-null     object
#  6   Height  40 non-null     object
#  7   ID      40 non-null     int64 
# dtypes: int64(4), object(4)
# memory usage: 2.6+ KB
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
print(df.describe())
# Select three columns, then keep only the first three rows.
selected_data = df[['Gender', 'Age', "ID"]].iloc[:3]
print(selected_data)
Modifying data

#Adding a new column
# A list assigned as a column must hold exactly one value per row. A fixed
# three-element list raises ValueError on any frame that does not have
# exactly three rows (for example the 40-row MW.csv data), so the sample
# salaries are repeated cyclically to match len(df). For a three-row frame
# the result is the same as assigning the list directly.
sample_salaries = [50000, 60000, 70000]
df['Salary'] = [sample_salaries[row % len(sample_salaries)]
                for row in range(len(df))]
print(df)

#Modify an existing column:
# Element-wise arithmetic: every age is increased by one.
df['Age'] = df['Age'] + 1
print(df)

#Deleting a column:
# drop() returns a new frame; rebinding df discards the Salary column.
df = df.drop(columns=['Salary'])
print(df)

Data group and merge


#Grouping data and calculating aggregates:
# numeric_only=True limits the mean to numeric columns. Without it,
# pandas 2.x raises TypeError as soon as the frame contains text (object)
# columns, because a mean of strings is undefined.
grouped = df.groupby('Age').mean(numeric_only=True)
print(grouped)

# merge DataFrame:

# Two small frames that share the 'Name' column.
df1 = pd.DataFrame(dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
))
df2 = pd.DataFrame(dict(
    Name=['Alice', 'Bob', 'David'],
    Salary=[50000, 60000, 70000],
))
# Inner join on 'Name': only names present in both frames are kept.
merged = df1.merge(df2, how='inner', on='Name')
print(merged)
# Saving data to a CSV file (index=False leaves the row index out of the file):
df.to_csv(path_or_buf='output.csv', index=False)

# Saving data to Excel file (again without the row index):
df.to_excel(excel_writer='output.xlsx', index=False)

Operations on missing data

# Checking for missing values: count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Deleting rows with missing values: keep only rows without any missing entry.
df = df.dropna(axis=0)
print(df)

# Filling in missing values: replace missing ages with the mean age.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(value=mean_age)
print(df)