import pandas as pd
from sklearn import metrics

# IPython/notebook shell escape (not plain Python): runs `wget` quietly with a
# progress bar. Presumably this fetched the dataset file read below.
# NOTE(review): the URL argument is an empty string here, so nothing can be
# downloaded as written -- restore the dataset URL before running.
!wget -q --show-progress ""

# Load the raw table so the diagnosis labels can be recoded as numbers.
data = pd.read_csv('cancer.csv')

# Recode malignant ('M') as 1 and benign ('B') as 0. The result is assigned
# back to the column instead of calling replace(..., inplace=True) on a column
# selection: that chained in-place form is deprecated in pandas and has no
# effect on `data` once copy-on-write is enabled.
data['diagnosis'] = data['diagnosis'].replace({'M': 1, 'B': 0})

# Persist the recoded labels. Later code re-reads 'cancer.csv' and maps the
# integer values 1/0 to display names, so the encoding must reach the file
# before `data` is discarded.
data.to_csv('cancer.csv', index=False)

# Free the temporary frame; the file on disk is the hand-off to later code.
del data
# First, import helpful Python tools for loading/navigating data
import os             # Good for navigating your computer's files
import numpy as np    # Great for lists (arrays) of numbers
import pandas as pd   # Great for tables (google spreadsheets, microsoft excel, csv)
from sklearn.metrics import accuracy_score   # Great for creating quick ML models
# Path of the data file that the rest of the script reads.
# (According to the original note, it is downloaded in the set-up cell.)
data_path = 'cancer.csv'

# Read the CSV into a table called `dataframe` using pd.read_csv(filepath).
dataframe = pd.read_csv(data_path)

# Narrow `dataframe` to the label column plus the feature columns discussed.
dataframe = dataframe[
    [
        'diagnosis',
        'perimeter_mean',
        'radius_mean',
        'texture_mean',
        'area_mean',
        'smoothness_mean',
        'concavity_mean',
        'symmetry_mean',
    ]
]

# Add `diagnosis_cat`: a categorical, human-readable version of `diagnosis`
# in which 1 becomes '1 (malignant)' and 0 becomes '0 (benign)'.
dataframe['diagnosis_cat'] = (
    dataframe['diagnosis']
    .astype('category')
    .map({1: '1 (malignant)', 0: '0 (benign)'})
)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
# Categorical plot of `radius_mean` for each diagnosis label, with the
# malignant group listed first and the benign group second.
sns.catplot(
    data=dataframe,
    x='radius_mean',
    y='diagnosis_cat',
    order=['1 (malignant)', '0 (benign)'],
)
from sklearn.model_selection import train_test_split

# Split the rows into training and test tables: 20% is held out for testing,
# and the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(dataframe, test_size=0.2, random_state=1)

# Logistic-regression classifier with scikit-learn's default settings.
logreg_model = linear_model.LogisticRegression()

# Names of the input (feature) columns and of the output (label) column.
X = ['perimeter_mean', 'radius_mean', 'texture_mean', 'area_mean',
     'smoothness_mean', 'concavity_mean', 'symmetry_mean']
y = 'diagnosis_cat'

# Training inputs; show the first few rows under the heading.
X_train = train_df[X]
print('X_train, our input variables:')
print(X_train.head())
print()

# Training labels; show the first few rows under the heading.
y_train = train_df[y]
print('y_train, our output variable:')
print(y_train.head())

# Fit the classifier on the training data. Without this step any later call
# to logreg_model.predict(...) raises sklearn's NotFittedError.
logreg_model.fit(X_train, y_train)
# --- Recorded notebook output (not code) ---
# X_train, our input variables:
#      perimeter_mean  radius_mean  texture_mean  area_mean  smoothness_mean  \
# 408          117.80        17.99         20.66      991.7          0.10360
# 4            135.10        20.29         14.34     1297.0          0.10030
# 307           56.36         9.00         14.40      246.3          0.07005
# 386           78.78        12.21         14.09      462.0          0.08108
# 404           78.29        12.34         14.95      469.1          0.08682
#
#      concavity_mean  symmetry_mean
# 408        0.120100         0.1992
# 4          0.198000         0.1809
# 307        0.003681         0.1788
# 386        0.068390         0.1646
# 404        0.021090         0.1571
#
# y_train, our output variable:
# 408    1 (malignant)
# 4      1 (malignant)
# 307       0 (benign)
# 386       0 (benign)
# 404       0 (benign)
# Name: diagnosis_cat, dtype: category
# Categories (2, object): ['0 (benign)', '1 (malignant)']
# Held-out inputs and their true labels, using the same column names (`X`, `y`)
# that were used to build the training data.
X_test = test_df[X]
y_test = test_df[y]

# Predict a label for every test row.
# NOTE: predict() requires that logreg_model has already been fitted.
y_pred = logreg_model.predict(X_test)

# `test_df` is derived from `dataframe` by train_test_split; adding a column to
# it directly can raise pandas' SettingWithCopyWarning (depending on the
# pandas / scikit-learn versions). Take an explicit copy first so the new
# column is unambiguously written to an independent table.
test_df = test_df.copy()
test_df['predicted'] = y_pred

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)