Scikit-learn, Seaborn and Pandas for Datasets

Let's find out what possibilities Python gives us for working with datasets.
We can use the following libraries for this:
scikit-learn
seaborn
TensorFlow (tfds)
PyTorch (torchvision and torchaudio)
Kaggle API
Today we will discuss the use of the first two with pandas elements, which we already know from previous lessons.

Hello AI, how can I get well-known datasets in Python? Show me the libraries and a method for listing the datasets from each library.
from sklearn import datasets
import seaborn as sns

# Dataset discovery
# scikit-learn: the loader functions are the attributes of
# sklearn.datasets whose names begin with "load".
dt_sklearn = dir(datasets)
print(dt_sklearn)

for loader_name in dt_sklearn:
    if loader_name.startswith("load"):
        print(loader_name)

# seaborn: ships a helper that returns the sample-dataset names directly.
print(sns.get_dataset_names())
# or one name per line:
for dataset_name in sns.get_dataset_names():
    print(dataset_name)

sklearn and pandas

# sklearn and pandas
from sklearn.datasets import load_iris
import pandas as pd

# Fetch the bundled iris dataset (returns a Bunch object with
# .data, .target and .feature_names attributes).
iris = load_iris()

# Build a DataFrame from the feature matrix, then attach the target
# labels as an extra column.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target

# Quick inspection of the result.
print(iris_df.head())
print(iris_df[:20])

# Persist the DataFrame to disk without the row index.
iris_df.to_csv('iris_dataset.csv', index=False)

more sklearn and pandas

import requests
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import (cross_validate,
                                     train_test_split)
from sklearn.linear_model import LinearRegression
import math

# Inspect the raw file content (features, parameters) before parsing.
FilePath = r"C:\Users\Agnieszka\Desktop\Animals.csv"
# Context manager guarantees the handle is closed even on error.
with open(FilePath) as get_file:
    DataFile = get_file.read()
#print(DataFile)

import chardet

# Detect the file's encoding so read_csv can be told the right one.
with open(FilePath, 'rb') as fp:
    result = chardet.detect(fp.read())
    print(result)  # shows the detected encoding


df = pd.read_csv(FilePath, sep=';', na_values='No data',
                 encoding='ascii')
# BUG FIX: fill missing values with the float 0.0, NOT the string
# '0.00'. A string flips the numeric columns to object dtype, which
# breaks the column arithmetic below and feeds text into the
# regression further down.
df = df.fillna(0.0)
#print(df)

# Create a sum column from three others.
# .copy() avoids pandas' SettingWithCopyWarning when assigning the
# new 'DeerFamily' column to a selection.
df = df[['Moose', 'Boars', 'Deer', 'Dama', 'Fawn']].copy()
df['DeerFamily'] = df['Deer'] + df['Fawn'] + df['Dama']
df = df[['Moose', 'Boars', 'DeerFamily']]
#print(df)

# Label = the forecast column shifted forward by 10% of the rows,
# i.e. each row's label is the value forecast_out rows in the future.
forecast_col = 'DeerFamily'
forecast_out = int(math.ceil(0.1 * len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)
#print(df)
# The tail rows have no future value after the shift; zero them
# (again as a float, not a string).
df = df.fillna(0.0)
print(df.head(2))
print(df.tail(3))

# Regression: features X are everything except the label column.
X = np.array(df.drop(['label'], axis=1))
print(X[:3])
y = np.array(df['label'])
X = preprocessing.scale(X)  # standardize: zero mean, unit variance

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = (
    train_test_split(X, y, test_size=0.2))

clf = LinearRegression()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)  # R^2 on the held-out split

print(accuracy)
#output: -1.1791136212815982

seaborn

# seaborn
import seaborn as sns
import pandas as pd

# Load the 'planets' sample dataset bundled with seaborn.
planets_df = sns.load_dataset('planets')
print(planets_df.columns)
print(planets_df.describe())

# Replace missing values with zeros, modifying the frame in place.
planets_df.fillna(0, inplace=True)

print("\n DataFrame Planets column Mass as example \
    after in-place replacement of NaN with zeros:")
print(planets_df['mass'])

# Work with the 'mass' column: first 100 rows as a Series and a list.
Mass_Column = planets_df['mass']
Mass_Column100 = Mass_Column[:100]
print(Mass_Column100)
Mass_List = Mass_Column100.tolist()
print(Mass_List)

# Save the column after changes to csv, first 100 rows.
# BUG FIX: a plain Python list has no to_csv(); export the pandas
# Series (Mass_Column100) instead.
Mass_Column100.to_csv('planets.csv', index=False)

more seaborn

from sklearn import datasets
import seaborn as sns

# Load the 'exercise' sample dataset and take a first look.
exercise_df = sns.load_dataset('exercise')
print(exercise_df.head())
print(exercise_df.tail())
print(exercise_df['kind'].drop_duplicates())
''' output:
0        rest
30    walking
60    running
Name: kind, dtype: category
Categories (3, object): ['rest', 'walking', 'running']'''

column = 'pulse'

# Find the maximum and minimum values of that column.
max_value = exercise_df[column].max()
min_value = exercise_df[column].min()

print(f"Max value in column '{column}' is: {max_value}")
print(f"Min value in column '{column}' is: {min_value}")

# Frequency of each diet, then the rows sorted by 'time', newest first.
print(exercise_df['diet'].value_counts())
sorted_data = exercise_df.sort_values(by='time', ascending=False)
print(sorted_data)

# Tag each pulse reading: above 100 counts as 'High', otherwise 'Low'.
def categorize_pulse(value):
    return 'High' if value > 100 else 'Low'

exercise_df['pulse_category'] = exercise_df['pulse'].apply(categorize_pulse)
print(exercise_df[['pulse', 'pulse_category']])

# Aggregate: pulse statistics plus the number of distinct time points.
aggregated = exercise_df.agg({
    'pulse': ['mean', 'max', 'min'],
    'time': ['nunique']
})
print(aggregated)

Stats exercises with Python libs and AI

Besides pandas and numpy, there are other Python libraries in the statistics domain, such as: scipy, sklearn, matplotlib for data visualization, or statsmodels, which provides classes and functions for the estimation of many different statistical models, as well as for conducting statistical tests and statistical data exploration. Let’s see how to use these many options by practicing with AI.

Start by creating lists from data generated from a CSV file (sample file available to download here: https://heart4datascience.com/2020/12/20/pandas/)

# stats lib — libraries used throughout the statistics exercises
import numpy as np                    # pip install numpy
import pandas as pd                   # pip install pandas
from scipy import stats               # pip install scipy
import matplotlib.pyplot as plt       # pip install matplotlib
import seaborn as sns                 # pip install seaborn
import statsmodels.api as sm          # pip install statsmodels

# Read the sample file used in the exercises below.
df = pd.read_csv('MW.csv')

# Extract the two columns of interest as pandas Series.
# NOTE(review): assumes MW.csv has 'IQ' and 'Age' columns — verify
# against the downloaded sample file.
IQ_Column = df['IQ']
Age_Column = df['Age']

# Convert to plain Python lists; later snippets consume these names.
IQ_List = IQ_Column.tolist()
Age_List = Age_Column.tolist()


# Sanity-check the extracted data and its size.
print(IQ_List)
print(Age_List)
print(len(IQ_List))
print(len(Age_List))

And now let’s compute basic statistics such as the mean or median with each library and compare the results of their methods.

# AVERAGE MEASURES

# numpy — note np.std() is the population standard deviation (ddof=0)
np_data = IQ_List
IQ_mean = np.mean(np_data)
IQ_median = np.median(np_data)
IQ_standard_dev = np.std(np_data)
print(IQ_mean, IQ_median, IQ_standard_dev)
# result: 112.35 113.0 23.31903728716089

# pandas — Series.std() uses the sample formula (ddof=1), which is
# why its value differs from numpy's above
pd_data = pd.Series(IQ_List)
IQ_mean = pd_data.mean()
IQ_median = pd_data.median()
IQ_standard_dev = pd_data.std()
print(IQ_mean, IQ_median, IQ_standard_dev)
# result: 112.35 113.0 23.616107063199742

# scipy — trimmed mean/std, percentile score and mode, demonstrated
# on a small six-element slice
scp_data = IQ_List[:6]
scp_mean = stats.tmean(scp_data)
scp_median = stats.scoreatpercentile(scp_data, 20)
scp_mode = stats.mode(scp_data)
scp_dev = stats.tstd(scp_data)

print(scp_data)             # [132, 150, 123, 129, 132, 90]
print(scp_mean)             # 126.0
print(scp_median)           # 123.0
print(scp_mode)             # ModeResult(mode=132, count=2)
print(scp_dev)              # 19.809088823063014

# statsmodels: sm.tsa.stattools.stats merely re-exports scipy.stats,
# so reaching through statsmodels internals is fragile and
# undocumented — call scipy.stats (imported above) directly instead.
# (Also removed the unused scp_data = Age_List[:6] assignment.)
sm_tmean = stats.tmean(Age_List)  # trimmed mean; no limits -> plain mean
# 47.85
sm_gmean = stats.gmean(Age_List)  # geometric mean
# 40.90597080827608
print(sm_tmean)
print(sm_gmean)

#seaborn and matplotlib: quick histogram of the age distribution

sns.histplot(Age_List)
plt.show()