# Supervised Learning with scikit-learn

## Binary Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide pandas display settings: show every column, cap printed
# rows at 100, and use a very wide line so frames are not wrapped.
for _option, _value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
    ("display.width", 10000),
):
    pd.set_option(_option, _value)

In [None]:
# Load the churn training data from CSV into a DataFrame (path is relative
# to the notebook's working directory).
churn_df = pd.read_csv("Data/train.csv")

In [None]:
# Preview the first rows of the churn data as a quick sanity check.
churn_df.head()

In [None]:
# (number of rows, number of columns) of the churn data.
churn_df.shape

In [None]:
# Per-column summary: names, non-null counts and dtypes.
churn_df.info()

## Perform Exploratory Data Analysis (EDA)

In [None]:
# Class balance of the target: print the raw counts, then chart the shares.
# The counts are computed once and reused instead of re-filtering the frame
# separately for each label.
churn_counts = churn_df['churn'].value_counts()
print(churn_counts)

# .get(..., 0) keeps a label that is absent from the data at zero rather
# than raising KeyError.
not_churned = churn_counts.get('no', 0)
churned = churn_counts.get('yes', 0)

fig = plt.figure(figsize=(5,5))
plt.pie([not_churned, churned],
        labels=['not churned', 'churned'],
        autopct='%.2f%%')  # '%%' prints a literal percent sign after each share
plt.title('Pie chart customers churned vs not churned')
plt.show()

In [None]:
# Summary statistics of churn_df (describe() covers numeric columns by default).
churn_df.describe()

## The supervised learning workflow

### k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Feature matrix: daytime and evening charges; target vector: the churn label.
X = churn_df.loc[:, ["total_day_charge", "total_eve_charge"]].values
y = churn_df.loc[:, "churn"].values

In [None]:
# X should be 2-D with two feature columns; y should hold one label per row of X.
print(X.shape, y.shape)

In [None]:
# KNN classifier that predicts by majority vote among the 15 nearest training points.
knn = KNeighborsClassifier(n_neighbors = 15)

In [None]:
# Fit on the complete dataset; no hold-out set is used at this stage.
knn.fit(X, y)

In [None]:
# Three unseen observations to classify. Each row supplies the two features
# in the same order as the columns of X: (total_day_charge, total_eve_charge).
X_new = np.array([(56.8, 17.5), (24.4, 24.1), (50.1, 10.9)])

In [None]:
# Expect (3, 2): three observations with two features each.
print(X_new.shape)

In [None]:
# One predicted churn label per row of X_new.
predictions = knn.predict(X_new)

In [None]:
# Show the predicted labels for the new observations.
print(f"Predictions: {predictions}")

## Measuring model performance

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. stratify=y asks for the same class
# proportions in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
    stratify=y,
)

In [None]:
# Train a 6-neighbour model on the training split (fit returns the fitted
# estimator itself), then report its accuracy on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)
print(knn.score(X_test, y_test))

### Plotting our results

In [None]:
# Fit one KNN model for every k from 1 to 25 and record its accuracy on the
# training and test splits, keyed by k.
neighbors = np.arange(1, 26)
train_accuracies, test_accuracies = {}, {}
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor).fit(X_train, y_train)
    train_accuracies[neighbor] = knn.score(X_train, y_train)
    test_accuracies[neighbor] = knn.score(X_test, y_test)

### Model complexity curve

In [None]:
# Model complexity curve: training vs. testing accuracy for each k.
plt.figure(figsize=(8, 6))
plt.title("KNN: Varying Number of Neighbors")
# Materialise the dict views as lists before plotting. A dict_values object
# is not a sequence Matplotlib converts element-wise, which can surface as an
# x/y shape mismatch. Insertion order matches the order of `neighbors`.
plt.plot(neighbors, list(train_accuracies.values()), label="Training Accuracy")
plt.plot(neighbors, list(test_accuracies.values()), label="Testing Accuracy")
plt.legend()
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.show()

# Introduction to Regression

In [None]:
# Load the diabetes data from CSV into a DataFrame (path is relative to the
# notebook's working directory).
diabetes_df = pd.read_csv("Data/diabetes.csv")

In [None]:
# Preview the first 10 rows of the diabetes data.
diabetes_df.head(10)

## Creating feature and target arrays

In [None]:
# Regression target is the "Glucose" column; every other column is a feature.
X = diabetes_df.drop(columns="Glucose").values
y = diabetes_df.loc[:, "Glucose"].values
# Print the container types of the feature and target arrays.
print(type(X), type(y))

## Making predictions from a single feature

In [None]:
# Take a single feature column by position.
# NOTE(review): index 4 is treated as body mass index below, but the position
# depends on the CSV's column order once "Glucose" is dropped — confirm it.
X_bmi = X[:, 4]
# Both arrays are 1-D at this point and should have the same length.
print(y.shape, X_bmi.shape)

In [None]:
# Turn the 1-D feature vector into a single-column 2-D array, the layout
# passed to the estimator's fit/predict below.
X_bmi = X_bmi.reshape(-1, 1)
print(X_bmi.shape)

## Plotting glucose vs. body mass index

In [None]:
# Scatter plot of the target against the single selected feature.
plt.scatter(X_bmi, y)
plt.xlabel("Body Mass Index")
plt.ylabel("Blood Glucose (mg/dl)")
plt.show()

## Fitting a regression model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression with default settings.
reg = LinearRegression()

In [None]:
# Fit the target on the single feature column, using all rows.
reg.fit(X_bmi, y)

In [None]:
# Fitted values for the same rows the model was trained on (used only to
# draw the regression line, not to evaluate the model).
predictions = reg.predict(X_bmi)

In [None]:
# Observed points with the fitted regression line drawn over them in red.
plt.scatter(X_bmi, y)
plt.plot(X_bmi, predictions, color="red")
plt.xlabel("Body Mass Index")
plt.ylabel("Blood Glucose (mg/dl)")
plt.show()

## Linear regression using all features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Separate model for the all-features regression.
reg_all = LinearRegression()

In [None]:
# Fit on the training split only, using every feature column.
reg_all.fit(X_train, y_train)

In [None]:
# Predictions for the held-out test rows.
y_pred = reg_all.predict(X_test)

## R-squared

In [None]:
# R-squared of the fitted model on the held-out test split.
reg_all.score(X_test, y_test)

## Mean squared error and root mean squared error
### RMSE in scikit-learn

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root mean squared error on the test split, in the same units as the target.
# The square root is taken explicitly: the `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
# TypeError, whereas this form works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

## Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# 6-fold splitter; rows are shuffled before splitting, with a fixed seed so
# the folds are reproducible.
kf = KFold(n_splits=6, shuffle=True, random_state=42)

In [None]:
# Fresh, unfitted model for cross-validation (rebinds the earlier `reg`).
reg = LinearRegression()

In [None]:
# One score per fold. No `scoring` argument is given, so the estimator's own
# score method is used (R-squared for LinearRegression).
cv_results = cross_val_score(reg, X, y, cv=kf)

## Evaluating cross-validation performance

In [None]:
# The individual per-fold scores.
print(cv_results)

In [None]:
# Average fold score and its spread (standard deviation) across the folds.
print(cv_results.mean(), cv_results.std())

In [None]:
# 2.5th and 97.5th percentiles of the fold scores, i.e. the bounds of the
# central 95% of the observed scores.
print(np.quantile(cv_results, [0.025, 0.975]))