
No Data? No Problem! (Sklearn datasets)
In this post, I explain how to fetch datasets from scikit-learn's built-in datasets module and perform simple preprocessing (merging frames and renaming columns) for your data science project.
# reference: https://scikit-learn.org/stable/datasets.html
# import libraries
import pandas as pd
from sklearn import datasets
# help(datasets) # run to read more about the datasets
# Examples
# Diabetes dataset: combine the features and the target into one DataFrame.
diabetes = datasets.load_diabetes()
# Feature matrix, with columns labelled by the dataset's own feature names.
dfx = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
# Target values as a single-column frame named "target".
dfy = pd.DataFrame(data=diabetes.target, columns=["target"])
# Pair each feature row with its target value via the shared row index.
df = pd.merge(dfx, dfy, left_index=True, right_index=True)
# Uncomment to inspect the result:
# print(df.shape)
# df.head()
# Wine dataset: combine the features and the target into one DataFrame.
wine = datasets.load_wine()
# Feature matrix, with columns labelled by the dataset's own feature names.
dfx = pd.DataFrame(data=wine.data, columns=wine.feature_names)
# Target values as a single-column frame named "target".
dfy = pd.DataFrame(data=wine.target, columns=["target"])
# Pair each feature row with its target value via the shared row index.
df = pd.merge(dfx, dfy, left_index=True, right_index=True)
# Uncomment to inspect the result:
# print(df.shape)
# df.head()
# Digits dataset: combine the features and the target into one DataFrame.
digits = datasets.load_digits()
# Feature matrix, with columns labelled by the dataset's own feature names.
dfx = pd.DataFrame(data=digits.data, columns=digits.feature_names)
# Target values as a single-column frame named "target".
dfy = pd.DataFrame(data=digits.target, columns=["target"])
# Pair each feature row with its target value via the shared row index.
df = pd.merge(dfx, dfy, left_index=True, right_index=True)
# Uncomment to inspect the result:
# print(df.shape)
# df.head()
# Breast cancer dataset: combine the features and the target into one DataFrame.
breast_cancer = datasets.load_breast_cancer()
# Feature matrix, with columns labelled by the dataset's own feature names.
dfx = pd.DataFrame(
    data=breast_cancer.data,
    columns=breast_cancer.feature_names,
)
# Target values as a single-column frame named "target".
dfy = pd.DataFrame(data=breast_cancer.target, columns=["target"])
# Pair each feature row with its target value via the shared row index.
df = pd.merge(dfx, dfy, left_index=True, right_index=True)
# Uncomment to inspect the result:
# print(df.shape)
# df.head()
# Iris dataset: combine the features and the target into one DataFrame,
# then report its dimensions and show the first rows.
iris = datasets.load_iris()
dfx = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Replace spaces in the column names with underscores.
dfx = dfx.rename(columns=lambda x: x.replace(" ", "_"))
# Target values as a single-column frame named "target".
dfy = pd.DataFrame(data=iris.target).rename(columns={0: "target"})
# Pair each feature row with its target value via the shared row index.
df = dfx.merge(dfy, left_index=True, right_index=True)
print("Dimension of the Iris dataset: ", df.shape)
print()
print("First 10 rows of the Iris dataset:")
# A bare `df.head(10)` is only displayed by an interactive notebook/REPL;
# print it explicitly so the rows also appear when this runs as a script.
print(df.head(10))