We would like to find out what options Python gives us for obtaining well-known datasets.
We can use the following libraries for this:
scikit-learn
seaborn
TensorFlow (tfds)
PyTorch (torchvision and torchaudio)
Kaggle API
Today we will discuss how to use the first two together with pandas, which we already know from previous lessons.
Hello AI, how can I get well-known datasets in Python? Show me the libraries and, for each library, the method for getting its list of datasets.
from sklearn import datasets
import seaborn as sns
# Listing the datasets offered by each library.

# scikit-learn: show every public name in sklearn.datasets, then only the
# names that begin with "load".
dt_sklearn = dir(datasets)
print(dt_sklearn)
for loader_name in filter(lambda name: name.startswith("load"), dt_sklearn):
    print(loader_name)

# seaborn: print the dataset names as a single list ...
print(sns.get_dataset_names())
# ... or one name per line.
for dataset_name in sns.get_dataset_names():
    print(dataset_name)
sklearn and pandas
#sklearn and pandas
from sklearn.datasets import load_iris
import pandas as pd
# Fetch the iris dataset from scikit-learn.
iris = load_iris()

# Build a DataFrame with one column per feature and append the target
# values (as provided by the loader) in a 'species' column.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target
)

# Preview the first 5 rows, then the first 20 rows.
print(iris_df.head())
print(iris_df.head(20))

# Write the DataFrame to a CSV file without the index column.
iris_df.to_csv('iris_dataset.csv', index=False)
more sklearn and pandas
import requests
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import (cross_validate,
train_test_split)
from sklearn.linear_model import LinearRegression
import math
# Read the CSV file once as raw bytes, detect its text encoding, and keep a
# decoded copy of the contents for inspection.
import chardet

FilePath = r"C:\Users\Agnieszka\Desktop\Animals.csv"

# The context manager guarantees the handle is closed even if read() fails.
# Reading bytes (not text) avoids decoding with the platform default
# encoding before the real encoding is known.
with open(FilePath, 'rb') as fp:
    raw_bytes = fp.read()

result = chardet.detect(raw_bytes)
print(result)  # shows the detected encoding

# chardet may report no encoding (e.g. for an empty file); fall back to
# UTF-8 and replace undecodable bytes instead of raising.
DataFile = raw_bytes.decode(result['encoding'] or 'utf-8', errors='replace')
# Parse the semicolon-separated CSV; cells containing 'No data' become NaN.
# Use the encoding found by chardet (stored in `result`) instead of a
# hard-coded one; fall back to ASCII if detection returned nothing.
df = pd.read_csv(FilePath, sep=';', na_values='No data',
                 encoding=result['encoding'] or 'ascii')

# Keep only the columns needed below. copy() makes an independent frame so
# that adding a column does not write into a slice of the parsed data.
df = df[['Moose', 'Boars', 'Deer', 'Dama', 'Fawn']].copy()

# Fill missing counts with a numeric zero. A string such as '0.00' would
# mix text into the numeric columns and break the addition below.
df = df.fillna(0.0)

# Create a sum column from three others.
df['DeerFamily'] = df['Deer'] + df['Fawn'] + df['Dama']
df = df[['Moose', 'Boars', 'DeerFamily']].copy()

# The label is the DeerFamily value `forecast_out` rows ahead
# (10% of the row count, rounded up).
forecast_col = 'DeerFamily'
forecast_out = int(math.ceil(0.1 * len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)

# The last `forecast_out` rows have no future value to use as a label.
# Drop them rather than inventing a zero target for the regression.
df = df.dropna(subset=['label'])

print(df.head(2))
print(df.tail(3))
# Regression: fit a linear model that predicts 'label' from the other columns.

# Feature matrix = every column except the label; show the first 3 raw rows.
X = np.array(df.drop(columns=['label']))
print(X[:3])

# Target vector, then standardise the features.
y = np.array(df['label'])
X = preprocessing.scale(X)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training part and score on the held-out part.
# NOTE(review): for regressors, scikit-learn documents score() as R^2, so
# the name "accuracy" is loose - confirm against the docs.
clf = LinearRegression().fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)
# Output recorded from one run: -1.1791136212815982
seaborn
# seaborn
import seaborn as sns
import pandas as pd
# Load the 'planets' dataset and show its columns and summary statistics.
planets_df = sns.load_dataset('planets')
print(planets_df.columns)
print(planets_df.describe())

# Update the DataFrame in place: replace every NaN with 0.
planets_df.fillna(0, inplace=True)
print("\n DataFrame Planets column Mass as example "
      "after in-place replacement of NaN with zeros:")
print(planets_df['mass'])

# Take the first 100 values of the 'mass' column and also show them as a
# plain Python list.
Mass_Column = planets_df['mass']
Mass_Column100 = Mass_Column[:100]
print(Mass_Column100)
Mass_List = Mass_Column100.tolist()
print(Mass_List)

# Save the first 100 rows of the updated column to CSV. to_csv is a pandas
# method, so it is called on the Series; a plain list has no to_csv.
Mass_Column100.to_csv('planets.csv', index=False)
more seaborn
from sklearn import datasets
import seaborn as sns
# Load the 'exercise' dataset and preview its first and last rows.
exercise_df = sns.load_dataset('exercise')
for preview in (exercise_df.head(), exercise_df.tail()):
    print(preview)

# Distinct values of the 'kind' column.
kind_values = exercise_df['kind']
print(kind_values.drop_duplicates())
# Output:
#   0        rest
#   30    walking
#   60    running
#   Name: kind, dtype: category
#   Categories (3, object): ['rest', 'walking', 'running']

column = 'pulse'
# Find the maximum and minimum value.
selected_values = exercise_df[column]
max_value = selected_values.max()
min_value = selected_values.min()
print(f"Max value in column '{column}' is: {max_value}")
print(f"Min value in column '{column}' is: {min_value}")

# How many rows there are for each 'diet' value.
diet_counts = exercise_df['diet'].value_counts()
print(diet_counts)

# Rows ordered by 'time', descending.
sorted_data = exercise_df.sort_values(by='time', ascending=False)
print(sorted_data)


def _pulse_category(pulse):
    """Return 'High' for a pulse above 100, otherwise 'Low'."""
    if pulse > 100:
        return 'High'
    return 'Low'


# Derive a categorical label from the numeric pulse.
exercise_df['pulse_category'] = exercise_df['pulse'].apply(_pulse_category)
print(exercise_df[['pulse', 'pulse_category']])

# Per-column aggregations: summary statistics for 'pulse' and the number of
# distinct values in 'time'.
aggregation_spec = {
    'pulse': ['mean', 'max', 'min'],
    'time': ['nunique'],
}
aggregated = exercise_df.agg(aggregation_spec)
print(aggregated)