Breast Cancer Analysis using Python¶

In [64]:
# Import the libraries needed for the analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
In [2]:
cancer_data_set = pd.read_csv("breast cancer data set.csv")
In [3]:
print(cancer_data_set)
           id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         17.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0            0.11840           0.27760         0.30010              0.14710   
1            0.08474           0.07864         0.08690              0.07017   
2            0.10960           0.15990         0.19740              0.12790   
3            0.14250           0.28390         0.24140              0.10520   
4            0.10030           0.13280         0.19800              0.10430   
..               ...               ...             ...                  ...   
564          0.11100           0.11590         0.24390              0.13890   
565          0.09780           0.10340         0.14400              0.09791   
566          0.08455           0.10230         0.09251              0.05302   
567          0.11780           0.27700         0.35140              0.15200   
568          0.05263           0.04362         0.00000              0.00000   

     ...  radius_worst  texture_worst  perimeter_worst  area_worst  \
0    ...        25.380          17.33           184.60      2019.0   
1    ...        24.990          23.41           158.80      1956.0   
2    ...        23.570          25.53           152.50      1709.0   
3    ...        14.910          26.50            98.87       567.7   
4    ...        22.540          16.67           152.20      1575.0   
..   ...           ...            ...              ...         ...   
564  ...        25.450          26.40           166.10      2027.0   
565  ...        23.690          38.25           155.00      1731.0   
566  ...        18.980          34.12           126.70      1124.0   
567  ...        25.740          39.42           184.60      1821.0   
568  ...         9.456          30.37            59.16       268.6   

     smoothness_worst  compactness_worst  concavity_worst  \
0             0.16220            0.66560           0.7119   
1             0.12380            0.18660           0.2416   
2             0.14440            0.42450           0.4504   
3             0.20980            0.86630           0.6869   
4             0.13740            0.20500           0.4000   
..                ...                ...              ...   
564           0.14100            0.21130           0.4107   
565           0.11660            0.19220           0.3215   
566           0.11390            0.30940           0.3403   
567           0.16500            0.86810           0.9387   
568           0.08996            0.06444           0.0000   

     concave points_worst  symmetry_worst  fractal_dimension_worst  
0                  0.2654          0.4601                  0.11890  
1                  0.1860          0.2750                  0.08902  
2                  0.2430          0.3613                  0.08758  
3                  0.2575          0.6638                  0.17300  
4                  0.1625          0.2364                  0.07678  
..                    ...             ...                      ...  
564                0.2216          0.2060                  0.07115  
565                0.1628          0.2572                  0.06637  
566                0.1418          0.2218                  0.07820  
567                0.2650          0.4087                  0.12400  
568                0.0000          0.2871                  0.07039  

[569 rows x 32 columns]

Dataset Understanding¶

To inspect the top rows, pass the number of rows as an argument (here 10); by default, head() returns the first 5 rows.

Displaying Head¶

In [70]:
cancer_data_set.head(10) 
Out[70]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 1 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 1 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 1 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
5 843786 1 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 ... 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.1741 0.3985 0.12440
6 844359 1 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 ... 22.88 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.1932 0.3063 0.08368
7 84458202 1 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 ... 17.06 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.1556 0.3196 0.11510
8 844981 1 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 ... 15.49 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.2060 0.4378 0.10720
9 84501001 1 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 ... 15.09 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.2210 0.4366 0.20750

10 rows × 32 columns

To inspect the bottom rows, pass the number of rows as an argument (here 10); by default, tail() returns the last 5 rows.

Displaying Tail¶

In [71]:
cancer_data_set.tail(10) 
Out[71]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
559 925291 0 11.51 23.93 74.52 403.5 0.09261 0.10210 0.11120 0.04105 ... 12.480 37.16 82.28 474.2 0.12980 0.25170 0.3630 0.09653 0.2112 0.08732
560 925292 0 14.05 27.15 91.38 600.4 0.09929 0.11260 0.04462 0.04304 ... 15.300 33.17 100.20 706.7 0.12410 0.22640 0.1326 0.10480 0.2250 0.08321
561 925311 0 11.20 29.37 70.67 386.0 0.07449 0.03558 0.00000 0.00000 ... 11.920 38.30 75.19 439.6 0.09267 0.05494 0.0000 0.00000 0.1566 0.05905
562 925622 1 15.22 30.62 103.40 716.9 0.10480 0.20870 0.25500 0.09429 ... 17.520 42.79 128.70 915.0 0.14170 0.79170 1.1700 0.23560 0.4089 0.14090
563 926125 1 20.92 25.09 143.00 1347.0 0.10990 0.22360 0.31740 0.14740 ... 24.290 29.41 179.10 1819.0 0.14070 0.41860 0.6599 0.25420 0.2929 0.09873
564 926424 1 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 ... 25.450 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.22160 0.2060 0.07115
565 926682 1 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 ... 23.690 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.16280 0.2572 0.06637
566 926954 1 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 ... 18.980 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.14180 0.2218 0.07820
567 927241 1 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 ... 25.740 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.26500 0.4087 0.12400
568 92751 0 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 ... 9.456 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.00000 0.2871 0.07039

10 rows × 32 columns

Key features from the DataFrame head

id column: contains unique IDs and therefore cannot be used for classification.

diagnosis column: binary values → the target column containing the class labels:

M - Malignant - tending to invade normal tissue

B - Benign - not harmful in effect.
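
As a quick check (a minimal sketch, not one of the original cells), the raw labels can be confirmed before any mapping is applied:

# Expect array(['M', 'B'], dtype=object) for the unmapped labels
print(cancer_data_set['diagnosis'].unique())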

Data Acquisition¶

Displaying Type¶

In [6]:
print(cancer_data_set.dtypes)  # check the data type of each column
id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
dtype: object

Displaying Null¶

To check for missing values in the dataset. If any data were missing we would need to correct it; in this case nothing is missing. isnull() returns True or False for each cell, and chaining sum() turns that into a per-column count.

In [7]:
cancer_data_set.isnull().sum()
Out[7]:
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

Display Size¶

To count the total number of elements in the DataFrame; .size returns the number of rows times the number of columns.

In [8]:
 
print("size of the DataFrame: ")
print(cancer_data_set.size)
size of the DataFrame: 
18208
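
The reported size is simply the product of the two entries in shape; a one-line sanity check (a sketch, not one of the original cells) makes that explicit:

rows, cols = cancer_data_set.shape
assert cancer_data_set.size == rows * cols  # 569 * 32 == 18208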

The describe() method returns a statistical summary of the numerical columns in the DataFrame: count (the number of non-empty values), mean (the average), standard deviation, minimum, quartiles, and maximum.

In [9]:
cancer_data_set.describe()
Out[9]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 31 columns

Displaying Shape¶

In [10]:
# .shape returns a tuple with the number of rows and the number of columns

print("Shape of DataFrame: ")
print(cancer_data_set.shape)
Shape of DataFrame: 
(569, 32)

Displaying Dimension¶

In [11]:
# .ndim returns the number of dimensions (2 for a DataFrame: rows and columns)
print("Number of Dimensions:")
print(cancer_data_set.ndim)
Number of Dimensions:
2

Data Preparation¶

In [12]:
# Creating the target class
# Storing the class label (target) in --> y_target (M or B)
y_target = cancer_data_set.diagnosis

# Making a list of unwanted columns
# (avoid naming it `list`, which would shadow the Python built-in)
cols_to_drop = ['id', 'diagnosis']

# Dropping the unnecessary columns (`axis=1` drops columns; `axis=0` would drop rows)
data = cancer_data_set.drop(cols_to_drop, axis=1)
data.head(10)
Out[12]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
5 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 0.2087 0.07613 ... 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.1741 0.3985 0.12440
6 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 0.1794 0.05742 ... 22.88 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.1932 0.3063 0.08368
7 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 0.2196 0.07451 ... 17.06 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.1556 0.3196 0.11510
8 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 0.2350 0.07389 ... 15.49 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.2060 0.4378 0.10720
9 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 0.2030 0.08243 ... 15.09 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.2210 0.4366 0.20750

10 rows × 30 columns

Creating the Pairplot from Seaborn¶

In [13]:
"""
Create dataset for finding contribution of individual features towards whether 
or not a certain cancer tumor is malignant or benign. 
"""

# Map values in diagnosis column, 0 representing benign  and 1 represeting malignant 
cancer_data_set['diagnosis'] = cancer_data_set['diagnosis'].map({'B': 0, 'M': 1}) 
data_for_corr = cancer_data_set[['radius_mean', 'perimeter_mean', 'area_mean',
                               'compactness_mean', 'concavity_mean',
                               'concave points_mean', 'diagnosis']]
#Create data_for_corr with various features and diagnosis 
data_for_corr.head() 
Out[13]:
radius_mean perimeter_mean area_mean compactness_mean concavity_mean concave points_mean diagnosis
0 17.99 122.80 1001.0 0.27760 0.3001 0.14710 1
1 20.57 132.90 1326.0 0.07864 0.0869 0.07017 1
2 19.69 130.00 1203.0 0.15990 0.1974 0.12790 1
3 11.42 77.58 386.1 0.28390 0.2414 0.10520 1
4 20.29 135.10 1297.0 0.13280 0.1980 0.10430 1

$B \rightarrow 0$

$M \rightarrow 1$

Data Exploration¶

A Seaborn pairplot shows the relationships between the individual features and the diagnosis.

Pairplot¶

In [14]:
sns.pairplot(data_for_corr, palette='coolwarm', hue='diagnosis')
Out[14]:
<seaborn.axisgrid.PairGrid at 0x204af7a3460>

Separating and discretizing¶

Now we have two DataFrames: one holding the class labels (y_target) and one holding the features (data).
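
A quick shape check (a sketch using the names defined above) confirms the separation:

print(y_target.shape)  # (569,)   -> class labels
print(data.shape)      # (569, 30) -> feature matrix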

Count plot¶

In [15]:
# Checking the distribution of the target variable using the seaborn library:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
ax = sns.countplot(x=y_target, label="Count")  # countplot shows the count of each class in the column
B, M = y_target.value_counts()  # `value_counts` (Pandas) stores the count of each class
print('Number of Benign tumors : ', B)
print('Number of Malignant tumors : ', M)
Number of Benign tumors :  357
Number of Malignant tumors :  212
In [16]:
data.describe()  # Pandas descriptive statistics
Out[16]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 30 columns

Data Visualization¶

Visualizing Standardized Data with Seaborn¶

Some feature values are in the hundreds while others are small fractions below one; data on such different scales cannot be plotted together meaningfully, so we standardize it. To put every feature on a comparable scale, we subtract each feature's mean from its values and divide by its standard deviation.
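
In formula form, each value is replaced by its z-score, where $\mu_j$ and $\sigma_j$ are the mean and standard deviation of feature $j$:

$z_{ij} = \dfrac{x_{ij} - \mu_j}{\sigma_j}$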

In [18]:
# First ten features

x_features = data
data_n = (data - data.mean()) / data.std()  # standardize the data for plotting

# Column-wise concatenation (axis=1) of the target with the first ten features
data_vis = pd.concat([y_target, data_n.iloc[:, 0:10]], axis=1)

# Flatten the dataset:
# `pd.melt` unpivots the DataFrame from wide format to long format,
# massaging it into the shape the plotting functions expect
data_vis = pd.melt(data_vis, id_vars="diagnosis",
                   var_name="features",
                   value_name='value')

plt.figure(figsize=(20,10))


sns.violinplot(x = "features",
               y = "value",
               hue = "diagnosis",
               data = data_vis,
               split = True, 
               inner = "quart"
               )

plt.xticks(rotation=45); # --matplotlib--

Interpreting the violin plot above: in texture_mean the medians of the malignant and benign classes are well separated, whereas in fractal_dimension_mean the two medians coincide. A feature whose class medians are well separated, like texture_mean, is a good candidate for classification.
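
To back this visual reading with numbers, we can compare the per-class medians of the standardized features (a minimal sketch using the data_n and y_target defined above, not one of the original cells):

# Per-class medians of two standardized features; a large gap supports the violin-plot reading
medians = data_n.groupby(y_target).median()
print(medians[['texture_mean', 'fractal_dimension_mean']])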

Violin Plots and Box Plots¶

In [20]:
# Second ten features
data_vis = pd.concat([y_target, data_n.iloc[:,10:20]], axis=1)

data_vis = pd.melt(data_vis, id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

plt.figure(figsize=(20,10))

sns.violinplot(x = "features",
               y = "value",
               hue = "diagnosis",
               data = data_vis,
               split = True, 
               inner = "quart")

plt.xticks(rotation=45); # --matplotlib--
In [21]:
# Third ten features
data_vis = pd.concat([y_target, data_n.iloc[:, 20:30]], axis=1)
data_vis = pd.melt(data_vis, id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

plt.figure(figsize=(20,10))

sns.violinplot(x = "features",
               y = "value",
               hue = "diagnosis",
               data = data_vis,
               split = True, 
               inner = "quart")

plt.xticks(rotation=45); # --matplotlib--

Interpreting the violin plot above: concavity_worst and concave points_worst look very similar. How can we decide whether they are correlated? If two features are highly correlated, the usual practice is to reduce redundancy by dropping one of the columns (the sketch below makes the check concrete).
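
Pandas can answer this directly: the Pearson correlation between the two columns quantifies what the violins suggest (a minimal sketch using the x_features frame defined earlier):

# Pearson correlation between the two suspiciously similar features
r = x_features['concavity_worst'].corr(x_features['concave points_worst'])
print('Pearson r = {:.3f}'.format(r))  # values near 1 indicate strong positive correlation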

In [23]:
# As an alternative to the violin plot, a box plot can be used;
# box plots are also useful for spotting outliers.
# We do not visualize all features with box plots here; this cell shows
# one example, and you can visualize the other features the same way.
plt.figure(figsize=(20,10))
sns.boxplot(x="features",
            y="value",
            hue="diagnosis",
            data=data_vis)

plt.xticks(rotation=45);
In [24]:
# Box plots are a good alternative when we want to detect outliers.

Using Joint Plots for Feature Comparison¶

In [25]:
# Checking how correlated the two features are
sns.jointplot(x='concavity_worst',
              y='concave points_worst',
              data=x_features,
              kind="reg")
Out[25]:
<seaborn.axisgrid.JointGrid at 0x204b2d720d0>

Looking at the joint plot above, we can say that these two features are highly correlated.

Data Wrangling Univariate Filters¶

In [27]:
sns.set(style="white")
df = x_features.loc[:,['radius_worst','perimeter_worst','area_worst']]
g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3);

Observing the Distribution of Values and their Variance with Swarm Plots¶

In [28]:
# First Ten features

import warnings
warnings.simplefilter(action="ignore", category=UserWarning)
sns.set(style="whitegrid", palette="muted")
data_dia = y_target  # diagnosis labels (kept for reference; not used below)

data_n = (data - data.mean()) / (data.std())  

data_vis = pd.concat([y_target, data_n.iloc[:,0:10]],axis=1)
data_vis = pd.melt(data_vis, id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

plt.figure(figsize=(20,10))
sns.swarmplot(x="features",
              y="value",
              hue="diagnosis",
              data=data_vis)

plt.xticks(rotation=45);
In [29]:
# Second ten features

import warnings
warnings.simplefilter(action="ignore", category=UserWarning)

data_vis = pd.concat([y_target, data_n.iloc[:,10:20]],axis=1)
data_vis = pd.melt(data_vis, id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

plt.figure(figsize=(20,10))
sns.swarmplot(x="features",
              y="value",
              hue="diagnosis",
              data=data_vis)

plt.xticks(rotation=45);
In [30]:
# Third ten features

import warnings
warnings.simplefilter(action="ignore", category=UserWarning)

data_vis = pd.concat([y_target, data_n.iloc[:, 20:30]], axis=1)
data_vis = pd.melt(data_vis, id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

plt.figure(figsize=(20,10))
sns.swarmplot(x="features",
              y="value",
              hue="diagnosis",
              data=data_vis)

plt.xticks(rotation=45);
In [31]:
# A swarm plot can be drawn on its own, but it is also a good complement to a box or violin
# plot when you want to show all observations along with a representation of the distribution.

# By looking at the spread in the swarm plot we can tell how well separated the classes are,
# and hence which features are best suited for classification.

How Well We Can Predict the Target Using the Correlated Features¶

In [32]:
# Relationship of each feature with the target variable

data_vis = pd.concat([y_target, data_n.iloc[:,0:30]],axis=1)
data_vis = pd.melt(data_vis, id_vars="diagnosis",
                    var_name="features",
                    value_name='value')

plt.figure(figsize=(30,20))
sns.swarmplot(x="features",
              y="value",
              hue="diagnosis",
              data=data_vis)

plt.xticks(rotation=90);

This plot provides a comprehensive overview of the relationship between each feature and the diagnosis in the breast cancer dataset. Some features clearly differentiate between the malignant and benign diagnoses, such as 'radius_mean', 'area_mean', and 'concavity_mean'; others, such as 'smoothness_mean', 'symmetry_mean', and 'fractal_dimension_mean', show much less separation between the two.
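
That visual impression can also be ranked numerically; for example, the absolute gap between the per-class means of the standardized features (a sketch assuming the data_n and y_target defined earlier) orders the features by how strongly they separate the two diagnoses:

# Absolute difference between malignant and benign class means, per standardized feature
class_means = data_n.groupby(y_target).mean()
separation = (class_means.loc['M'] - class_means.loc['B']).abs()
print(separation.sort_values(ascending=False).head(10))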

Observing all Pair-wise Correlations¶

Data Wrangling Heatmap¶

  • Each square shows the correlation between the variables on each axis. Correlation ranges from -1 to +1.

  • Values closer to zero mean there is no linear trend between the two variables.

  • The closer the correlation is to 1, the more positively correlated the variables are: as one increases, so does the other, and the stronger that relationship is.

  • A correlation closer to -1 is similar, but instead of both increasing, one variable will decrease as the other increases.

  • The diagonals are all 1 because those squares are correlating each variable to itself (so it's a perfect correlation).

  • For the rest the larger the number and darker the color the higher the correlation between the two variables. The plot is also symmetrical about the diagonal since the same two variables are being paired together in those squares.

In [33]:
# Pair-wise correlation across all the variables
f, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(x_features.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax, cmap="YlGnBu");

Dark cell $\rightarrow$ highly correlated

Light cell $\rightarrow$ weakly or negatively correlated

With the YlGnBu colormap, darker cells mark strongly correlated pairs and lighter cells mark weak or negative correlation. For example, concavity_worst and concavity_mean have a correlation of about 0.9, so they are highly correlated. In contrast, radius_worst and fractal_dimension_mean have a correlation of about -0.3, so they are barely correlated; uncorrelated features like these carry complementary information that is useful for further analysis.
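
Such pairs can also be pulled out programmatically rather than by eye, by scanning the upper triangle of the correlation matrix for large coefficients (a sketch; the 0.9 threshold is an illustrative assumption, not part of the original analysis):

# Keep only the upper triangle (k=1 excludes the diagonal) to avoid duplicate pairs
corr = x_features.corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
pairs = upper.stack()  # MultiIndex Series: (feature_a, feature_b) -> correlation
print(pairs[pairs.abs() > 0.9].sort_values(ascending=False))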

Data Wrangling Decision Tree Classifier¶

In [68]:
x_train, x_test, y_train, y_test = train_test_split(data_n.iloc[:,0:30], y_target, test_size=0.3, random_state=42)

clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)

# Evaluate the classifier on the testing set
y_pred = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='M')
recall = recall_score(y_test, y_pred, pos_label='M')
f1 = f1_score(y_test, y_pred, pos_label='M')

fig = plt.figure(figsize=(20,10))
tree.plot_tree(clf, filled=True)
plt.show()
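
Before reading too much into a single train/test split (which can be flattering or unflattering by chance), a quick k-fold cross-validation (a sketch, not part of the original notebook) gives a more stable estimate of the tree's accuracy:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the same decision tree on the standardized features
cv_scores = cross_val_score(tree.DecisionTreeClassifier(random_state=42),
                            data_n.iloc[:, 0:30], y_target, cv=5)
print('CV accuracy: {:.3f} +/- {:.3f}'.format(cv_scores.mean(), cv_scores.std()))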
In [69]:
#evaluation metrics of the classifier on the testing set

print('Accuracy: {:.3f}'.format(accuracy))
print('Precision: {:.3f}'.format(precision))
print('Recall: {:.3f}'.format(recall))
print('F1-score: {:.3f}'.format(f1))
Accuracy: 0.942
Precision: 0.896
Recall: 0.952
F1-score: 0.923

This indicates that the classifier has an accuracy of 0.942, meaning that it correctly predicts the class label for 94.2% of the samples in the testing set. The precision, recall, and F1-score for the malignant class are 0.896, 0.952, and 0.923, respectively, indicating that the classifier is relatively good at identifying malignant tumors.
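
For reference, these metrics follow the standard definitions in terms of true positives (TP), false positives (FP), and false negatives (FN):

$\text{Precision} = \frac{TP}{TP + FP}, \qquad \text{Recall} = \frac{TP}{TP + FN}, \qquad F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$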