Handling Data with Pandas

In this module we will learn how to handle data with the pandas library in Python. Pandas is a powerful library that provides data structures and data analysis tools for Python. It is widely used for data manipulation and analysis, making it an essential tool for data scientists and analysts.

# Import libraries
import pandas as pd
import numpy as np

pandas DataFrames

DataFrames are two-dimensional, size-mutable, potentially heterogeneous tabular data structures with labeled axes (rows and columns). They are similar to SQL tables or Excel spreadsheets. DataFrames can be created from various data sources, such as lists, dictionaries, or CSV files.

kids = [['Mike', 'boy', 7, 128], ['George', 'boy', 4, 112], ['Lily', 'girl', 8, 132], ['Yu', 'girl', 10, 143], ['Dan', 'boy', 9, 138]]

colNames = ['name', 'gender', 'age', 'height']

df = pd.DataFrame(data = kids, columns = colNames)
df

	name	gender	age	height
0	Mike	boy	7	128
1	George	boy	4	112
2	Lily	girl	8	132
3	Yu	girl	10	143
4	Dan	boy	9	138

# Check the column data types
df.dtypes

name      object
gender    object
age        int64
height     int64
dtype: object

Import data from a csv file

df = pd.read_csv('diabetes.csv')

df

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1
...	...	...	...	...	...	...	...	...	...
763	10	101	76	48	180	32.9	0.171	63	0
764	2	122	70	27	0	36.8	0.340	27	0
765	5	121	72	23	112	26.2	0.245	30	0
766	1	126	60	0	0	30.1	0.349	47	1
767	1	93	70	31	0	30.4	0.315	23	0

768 rows × 9 columns

# Select top N number of records (default = 5)
df.head()

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

# Select bottom N number of records (default = 5)
df.tail()

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
763	10	101	76	48	180	32.9	0.171	63	0
764	2	122	70	27	0	36.8	0.340	27	0
765	5	121	72	23	112	26.2	0.245	30	0
766	1	126	60	0	0	30.1	0.349	47	1
767	1	93	70	31	0	30.4	0.315	23	0

# Check the column data types
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

# Get the number of rows and columns in your dataframe
df.shape

(768, 9)

# The info method gives the column datatypes + number of non-null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

Subsetting DataFrames

# Select one column using double brackets
df[['Pregnancies']].head()

	Pregnancies
0	6
1	1
2	8
3	1
4	0

type(df[['Pregnancies']].head())

pandas.core.frame.DataFrame

# Select multiple columns using double brackets
df[['Pregnancies', 'Glucose']].head()

	Pregnancies	Glucose
0	6	148
1	1	85
2	8	183
3	1	89
4	0	137

type(df[['Pregnancies', 'Glucose']].head())

pandas.core.frame.DataFrame

# Select one column using single brackets
# This produces a pandas series which is a one-dimentional array that can be labeled
df['Pregnancies'].head()

0    6
1    1
2    8
3    1
4    0
Name: Pregnancies, dtype: int64

type(df['Pregnancies'].head())

pandas.core.series.Series

# Keep in mind that you can't select multiple columns using single brackets
df['Pregnancies', 'Glucose']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File /opt/hostedtoolcache/Python/3.13.5/x64/lib/python3.13/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3811 try:
-> 3812     return self._engine.get_loc(casted_key)
   3813 except KeyError as err:

File pandas/_libs/index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7096, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: ('Pregnancies', 'Glucose')

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[18], line 2
      1 # Keep in mind that you can't select multiple columns using single brackets
----> 2 df['Pregnancies', 'Glucose']

File /opt/hostedtoolcache/Python/3.13.5/x64/lib/python3.13/site-packages/pandas/core/frame.py:4107, in DataFrame.__getitem__(self, key)
   4105 if self.columns.nlevels > 1:
   4106     return self._getitem_multilevel(key)
-> 4107 indexer = self.columns.get_loc(key)
   4108 if is_integer(indexer):
   4109     indexer = [indexer]

File /opt/hostedtoolcache/Python/3.13.5/x64/lib/python3.13/site-packages/pandas/core/indexes/base.py:3819, in Index.get_loc(self, key)
   3814     if isinstance(casted_key, slice) or (
   3815         isinstance(casted_key, abc.Iterable)
   3816         and any(isinstance(x, slice) for x in casted_key)
   3817     ):
   3818         raise InvalidIndexError(key)
-> 3819     raise KeyError(key) from err
   3820 except TypeError:
   3821     # If we have a listlike key, _check_indexing_error will raise
   3822     #  InvalidIndexError. Otherwise we fall through and re-raise
   3823     #  the TypeError.
   3824     self._check_indexing_error(key)

KeyError: ('Pregnancies', 'Glucose')

With a pandas series, we can select rows using slicing like this: [start_index:end_index]. The end_index is not inclusive. This behavior is very similar to Python lists.

df['Age']

0      50
1      31
2      32
3      21
4      33
       ..
763    63
764    27
765    30
766    47
767    23
Name: Age, Length: 768, dtype: int64

df['Age'][0:10]

0    50
1    31
2    32
3    21
4    33
5    30
6    26
7    29
8    53
9    54
Name: Age, dtype: int64

df.Age.head()

0    50
1    31
2    32
3    21
4    33
Name: Age, dtype: int64

type(df.Age.head())

pandas.core.series.Series

The pandas attribute .loc allow you to select columns, index, and slice your data

# pandas dataframe
df.loc[:, ['Age']].head()

	Age
0	50
1	31
2	32
3	21
4	33

# pandas series
df.loc[:, 'Age'].head()

0    50
1    31
2    32
3    21
4    33
Name: Age, dtype: int64

df['Pregnancies'].value_counts()

Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
17      1
15      1
Name: count, dtype: int64

Filtering DataFrames

# Notice that the filter produces a pandas series of True and False values
filter_pregnancies = df['Pregnancies'] >= 3
filter_pregnancies

0       True
1      False
2       True
3      False
4      False
       ...  
763     True
764    False
765     True
766    False
767    False
Name: Pregnancies, Length: 768, dtype: bool

filter_pregnancies.head()

0     True
1    False
2     True
3    False
4    False
Name: Pregnancies, dtype: bool

df[filter_pregnancies]

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
2	8	183	64	0	0	23.3	0.672	32	1
5	5	116	74	0	0	25.6	0.201	30	0
6	3	78	50	32	88	31.0	0.248	26	1
7	10	115	0	0	0	35.3	0.134	29	0
...	...	...	...	...	...	...	...	...	...
759	6	190	92	0	0	35.5	0.278	66	1
761	9	170	74	31	0	44.0	0.403	43	1
762	9	89	62	0	0	22.5	0.142	33	0
763	10	101	76	48	180	32.9	0.171	63	0
765	5	121	72	23	112	26.2	0.245	30	0

419 rows × 9 columns

df.loc[filter_pregnancies, :]

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
2	8	183	64	0	0	23.3	0.672	32	1
5	5	116	74	0	0	25.6	0.201	30	0
6	3	78	50	32	88	31.0	0.248	26	1
7	10	115	0	0	0	35.3	0.134	29	0
...	...	...	...	...	...	...	...	...	...
759	6	190	92	0	0	35.5	0.278	66	1
761	9	170	74	31	0	44.0	0.403	43	1
762	9	89	62	0	0	22.5	0.142	33	0
763	10	101	76	48	180	32.9	0.171	63	0
765	5	121	72	23	112	26.2	0.245	30	0

419 rows × 9 columns

# Notice that it looks like nothing changes
# This is because we didn't update the dataframe after applying the filter
df['Pregnancies'].value_counts()

Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
17      1
15      1
Name: count, dtype: int64

# Update the dataframe to contain observations with pregnancies <= 3
df = df.loc[filter_pregnancies, :]

df['Pregnancies'].value_counts()

Pregnancies
3     75
4     68
5     57
6     50
7     45
8     38
9     28
10    24
11    11
13    10
12     9
14     2
17     1
15     1
Name: count, dtype: int64

df['Age'].value_counts()

Age
28    20
41    18
37    18
31    16
27    15
30    15
22    15
29    15
42    14
24    14
25    14
45    13
34    13
38    13
36    13
26    12
43    12
39    12
32    12
40    12
33    10
46    10
35     8
51     8
23     7
50     7
58     6
44     6
60     5
54     5
57     5
21     5
49     5
52     5
48     5
47     5
63     4
55     4
56     3
53     3
66     3
69     2
61     2
67     2
65     2
59     1
81     1
64     1
62     1
70     1
68     1
Name: count, dtype: int64

# Notice that the filter produces a pandas series of True and False values
df['Age'] <= 38

0      False
2       True
5       True
6       True
7       True
       ...  
759    False
761    False
762     True
763    False
765     True
Name: Age, Length: 419, dtype: bool

filter_age = df['Age'] <= 38

df = df.loc[filter_age, :]

df['Age'].value_counts()

Age
28    20
37    18
31    16
30    15
22    15
27    15
29    15
24    14
25    14
34    13
36    13
38    13
32    12
26    12
33    10
35     8
23     7
21     5
Name: count, dtype: int64

Combining filters

df.loc[filter_pregnancies & filter_age, :]

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
2	8	183	64	0	0	23.3	0.672	32	1
5	5	116	74	0	0	25.6	0.201	30	0
6	3	78	50	32	88	31.0	0.248	26	1
7	10	115	0	0	0	35.3	0.134	29	0
10	4	110	92	0	0	37.6	0.191	30	0
...	...	...	...	...	...	...	...	...	...
748	3	187	70	22	200	36.4	0.408	36	1
750	4	136	70	0	0	31.2	1.182	22	1
752	3	108	62	24	0	26.0	0.223	25	0
762	9	89	62	0	0	22.5	0.142	33	0
765	5	121	72	23	112	26.2	0.245	30	0

235 rows × 9 columns

Managing DataFrame columns

# Rename column names
df3 = df.rename(columns={
    'BloodPressure': 'BP',
    'SkinThickness': 'ST',
    'DiabetesPedigreeFunction': 'DPF'
})

df3

	Pregnancies	Glucose	BP	ST	Insulin	BMI	DPF	Age	Outcome
2	8	183	64	0	0	23.3	0.672	32	1
5	5	116	74	0	0	25.6	0.201	30	0
6	3	78	50	32	88	31.0	0.248	26	1
7	10	115	0	0	0	35.3	0.134	29	0
10	4	110	92	0	0	37.6	0.191	30	0
...	...	...	...	...	...	...	...	...	...
748	3	187	70	22	200	36.4	0.408	36	1
750	4	136	70	0	0	31.2	1.182	22	1
752	3	108	62	24	0	26.0	0.223	25	0
762	9	89	62	0	0	22.5	0.142	33	0
765	5	121	72	23	112	26.2	0.245	30	0

235 rows × 9 columns

# This approach allows you to drop multiple columns at a time
df3 = df3.drop(columns=['Insulin', 'BMI'])

df3

	Pregnancies	Glucose	BP	ST	DPF	Age	Outcome
2	8	183	64	0	0.672	32	1
5	5	116	74	0	0.201	30	0
6	3	78	50	32	0.248	26	1
7	10	115	0	0	0.134	29	0
10	4	110	92	0	0.191	30	0
...	...	...	...	...	...	...	...
748	3	187	70	22	0.408	36	1
750	4	136	70	0	1.182	22	1
752	3	108	62	24	0.223	25	0
762	9	89	62	0	0.142	33	0
765	5	121	72	23	0.245	30	0

235 rows × 7 columns

# Approach 2
del df3['Pregnancies']

df3

	Glucose	BP	ST	DPF	Age	Outcome
2	183	64	0	0.672	32	1
5	116	74	0	0.201	30	0
6	78	50	32	0.248	26	1
7	115	0	0	0.134	29	0
10	110	92	0	0.191	30	0
...	...	...	...	...	...	...
748	187	70	22	0.408	36	1
750	136	70	0	1.182	22	1
752	108	62	24	0.223	25	0
762	89	62	0	0.142	33	0
765	121	72	23	0.245	30	0

235 rows × 6 columns

Calculating statistics

df3.shape

(235, 6)

df3['Glucose'].sum()

np.int64(28536)

df3['DPF'].sum()

np.float64(102.571)

df3.sum()

Glucose    28536.000
BP         15807.000
ST          4272.000
DPF          102.571
Age         7004.000
Outcome       88.000
dtype: float64

'Mike' + 'George'

'MikeGeorge'

df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, 2 to 765
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Glucose  235 non-null    int64  
 1   BP       235 non-null    int64  
 2   ST       235 non-null    int64  
 3   DPF      235 non-null    float64
 4   Age      235 non-null    int64  
 5   Outcome  235 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 12.9 KB

df3.values

array([[1.83e+02, 6.40e+01, 0.00e+00, 6.72e-01, 3.20e+01, 1.00e+00],
       [1.16e+02, 7.40e+01, 0.00e+00, 2.01e-01, 3.00e+01, 0.00e+00],
       [7.80e+01, 5.00e+01, 3.20e+01, 2.48e-01, 2.60e+01, 1.00e+00],
       ...,
       [1.08e+02, 6.20e+01, 2.40e+01, 2.23e-01, 2.50e+01, 0.00e+00],
       [8.90e+01, 6.20e+01, 0.00e+00, 1.42e-01, 3.30e+01, 0.00e+00],
       [1.21e+02, 7.20e+01, 2.30e+01, 2.45e-01, 3.00e+01, 0.00e+00]],
      shape=(235, 6))

df3 = df3.loc[filter_age & filter_pregnancies, :]

df3

	Glucose	BP	ST	DPF	Age	Outcome
2	183	64	0	0.672	32	1
5	116	74	0	0.201	30	0
6	78	50	32	0.248	26	1
7	115	0	0	0.134	29	0
10	110	92	0	0.191	30	0
...	...	...	...	...	...	...
748	187	70	22	0.408	36	1
750	136	70	0	1.182	22	1
752	108	62	24	0.223	25	0
762	89	62	0	0.142	33	0
765	121	72	23	0.245	30	0

235 rows × 6 columns

# Export DataFrame to csv file
df3.to_csv(path_or_buf = 'diabetes2.csv', index = False)

df3.shape

(235, 6)