{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Working with Pandas II\n",
"Prepared by Dr. Kalpakis for CMSC 491/691, Fall 2017"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"import random"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reading a CSV using *pandas*"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Read the CSV into a dataframe. The CSV file is located at the same directory as the notebook file \n",
"#frame = pd.read_csv(\"911_Calls_for_Service.csv\", nrows=10000)\n",
"#frame.to_csv(\"911_Calls_for_Service_small.csv\")\n",
"frame = pd.read_csv(\"911_Calls_for_Service_small.csv\")\n",
"\n",
"# make random sample of dataframe\n",
"frame = frame.sample(frac=0.25, replace=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Peek at the head and tail of the data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" callDateTime | \n",
" priority | \n",
" district | \n",
" description | \n",
" callNumber | \n",
" incidentLocation | \n",
" location | \n",
"
\n",
" \n",
" \n",
" \n",
" 5606 | \n",
" 5606 | \n",
" 07/15/2015 12:54:00 AM | \n",
" Medium | \n",
" SW | \n",
" DISORDERLY | \n",
" P151960095 | \n",
" W NORTH AV/N ROSEDALE ST | \n",
" (39.308957,-76.669325) | \n",
"
\n",
" \n",
" 3869 | \n",
" 3869 | \n",
" 07/14/2015 01:21:00 PM | \n",
" High | \n",
" SE | \n",
" AUTO ACC/INJURY | \n",
" P151951442 | \n",
" 2700 BOSTON ST | \n",
" (39.2796259,-76.5785027) | \n",
"
\n",
" \n",
" 4010 | \n",
" 4010 | \n",
" 07/14/2015 09:22:00 PM | \n",
" Medium | \n",
" SD | \n",
" JUV DISTURBANCE | \n",
" P151953234 | \n",
" BAY-BROOK ELEM MIDDLE | \n",
" (39.226381,-76.599821) | \n",
"
\n",
" \n",
" 8165 | \n",
" 8165 | \n",
" 07/15/2015 06:57:00 PM | \n",
" Medium | \n",
" SE | \n",
" DISORDERLY | \n",
" P151962576 | \n",
" 3400 BLK HUDSON ST | \n",
" (39.2824920,-76.5687395) | \n",
"
\n",
" \n",
" 4630 | \n",
" 4630 | \n",
" 07/14/2015 05:20:00 PM | \n",
" Medium | \n",
" CD | \n",
" 911/NO VOICE | \n",
" P151952293 | \n",
" 1000 CATHEDRAL ST | \n",
" (39.3011812,-76.6180370) | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 callDateTime priority district description \\\n",
"5606 5606 07/15/2015 12:54:00 AM Medium SW DISORDERLY \n",
"3869 3869 07/14/2015 01:21:00 PM High SE AUTO ACC/INJURY \n",
"4010 4010 07/14/2015 09:22:00 PM Medium SD JUV DISTURBANCE \n",
"8165 8165 07/15/2015 06:57:00 PM Medium SE DISORDERLY \n",
"4630 4630 07/14/2015 05:20:00 PM Medium CD 911/NO VOICE \n",
"\n",
" callNumber incidentLocation location \n",
"5606 P151960095 W NORTH AV/N ROSEDALE ST (39.308957,-76.669325) \n",
"3869 P151951442 2700 BOSTON ST (39.2796259,-76.5785027) \n",
"4010 P151953234 BAY-BROOK ELEM MIDDLE (39.226381,-76.599821) \n",
"8165 P151962576 3400 BLK HUDSON ST (39.2824920,-76.5687395) \n",
"4630 P151952293 1000 CATHEDRAL ST (39.3011812,-76.6180370) "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first 5 and last 5 rows of the DF\n",
"frame.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" callDateTime | \n",
" priority | \n",
" district | \n",
" description | \n",
" callNumber | \n",
" incidentLocation | \n",
" location | \n",
"
\n",
" \n",
" \n",
" \n",
" 1708 | \n",
" 1708 | \n",
" 07/13/2015 07:13:00 PM | \n",
" Medium | \n",
" CD | \n",
" 911/NO VOICE | \n",
" P151942795 | \n",
" 600 E FAYETTE ST | \n",
" (39.2906737,-76.6071600) | \n",
"
\n",
" \n",
" 3788 | \n",
" 3788 | \n",
" 07/14/2015 01:47:00 PM | \n",
" Medium | \n",
" ED | \n",
" 911/NO VOICE | \n",
" P151951521 | \n",
" 600 N CAROLINE ST | \n",
" (39.2971386,-76.5975453) | \n",
"
\n",
" \n",
" 9768 | \n",
" 9768 | \n",
" 07/16/2015 10:03:00 AM | \n",
" Low | \n",
" NE | \n",
" DOG BITE | \n",
" P151970829 | \n",
" MORAVIA PARK ELEMENTARY | \n",
" (39.319454,-76.540329) | \n",
"
\n",
" \n",
" 3687 | \n",
" 3687 | \n",
" 07/14/2015 12:19:00 PM | \n",
" Low | \n",
" ED | \n",
" Transport | \n",
" P151951222 | \n",
" 300 N GAY ST | \n",
" (39.2940575,-76.6081374) | \n",
"
\n",
" \n",
" 823 | \n",
" 823 | \n",
" 07/13/2015 06:42:00 PM | \n",
" Low | \n",
" ED | \n",
" BAIL OUT | \n",
" P151942693 | \n",
" E LAFAYETTE AV/N BROADWAY | \n",
" (39.310655,-76.594903) | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 callDateTime priority district description \\\n",
"1708 1708 07/13/2015 07:13:00 PM Medium CD 911/NO VOICE \n",
"3788 3788 07/14/2015 01:47:00 PM Medium ED 911/NO VOICE \n",
"9768 9768 07/16/2015 10:03:00 AM Low NE DOG BITE \n",
"3687 3687 07/14/2015 12:19:00 PM Low ED Transport \n",
"823 823 07/13/2015 06:42:00 PM Low ED BAIL OUT \n",
"\n",
" callNumber incidentLocation location \n",
"1708 P151942795 600 E FAYETTE ST (39.2906737,-76.6071600) \n",
"3788 P151951521 600 N CAROLINE ST (39.2971386,-76.5975453) \n",
"9768 P151970829 MORAVIA PARK ELEMENTARY (39.319454,-76.540329) \n",
"3687 P151951222 300 N GAY ST (39.2940575,-76.6081374) \n",
"823 P151942693 E LAFAYETTE AV/N BROADWAY (39.310655,-76.594903) "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.tail(5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(#rows,#cols)= (2500, 8)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 2500.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 4985.444800 | \n",
"
\n",
" \n",
" std | \n",
" 2899.193284 | \n",
"
\n",
" \n",
" min | \n",
" 4.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 2430.750000 | \n",
"
\n",
" \n",
" 50% | \n",
" 4943.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 7490.000000 | \n",
"
\n",
" \n",
" max | \n",
" 9999.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0\n",
"count 2500.000000\n",
"mean 4985.444800\n",
"std 2899.193284\n",
"min 4.000000\n",
"25% 2430.750000\n",
"50% 4943.000000\n",
"75% 7490.000000\n",
"max 9999.000000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the shape (#rows, #cols) of the dataframe\n",
"print('(#rows,#cols)=', frame.shape)\n",
"\n",
"#compute some basic statictics of tha data in each column\n",
"frame.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Preprocessing\n",
"Create a fresh copy of the data frame for editing keeping the original intact."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#make a copy of the dataframe\n",
"df = frame.copy()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" callDateTime | \n",
" priority | \n",
" district | \n",
" description | \n",
" incidentLocation | \n",
" location | \n",
"
\n",
" \n",
" \n",
" \n",
" 1708 | \n",
" 1708 | \n",
" 07/13/2015 07:13:00 PM | \n",
" Medium | \n",
" CD | \n",
" 911/NO VOICE | \n",
" 600 E FAYETTE ST | \n",
" (39.2906737,-76.6071600) | \n",
"
\n",
" \n",
" 3788 | \n",
" 3788 | \n",
" 07/14/2015 01:47:00 PM | \n",
" Medium | \n",
" ED | \n",
" 911/NO VOICE | \n",
" 600 N CAROLINE ST | \n",
" (39.2971386,-76.5975453) | \n",
"
\n",
" \n",
" 9768 | \n",
" 9768 | \n",
" 07/16/2015 10:03:00 AM | \n",
" Low | \n",
" NE | \n",
" DOG BITE | \n",
" MORAVIA PARK ELEMENTARY | \n",
" (39.319454,-76.540329) | \n",
"
\n",
" \n",
" 3687 | \n",
" 3687 | \n",
" 07/14/2015 12:19:00 PM | \n",
" Low | \n",
" ED | \n",
" Transport | \n",
" 300 N GAY ST | \n",
" (39.2940575,-76.6081374) | \n",
"
\n",
" \n",
" 823 | \n",
" 823 | \n",
" 07/13/2015 06:42:00 PM | \n",
" Low | \n",
" ED | \n",
" BAIL OUT | \n",
" E LAFAYETTE AV/N BROADWAY | \n",
" (39.310655,-76.594903) | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 callDateTime priority district description \\\n",
"1708 1708 07/13/2015 07:13:00 PM Medium CD 911/NO VOICE \n",
"3788 3788 07/14/2015 01:47:00 PM Medium ED 911/NO VOICE \n",
"9768 9768 07/16/2015 10:03:00 AM Low NE DOG BITE \n",
"3687 3687 07/14/2015 12:19:00 PM Low ED Transport \n",
"823 823 07/13/2015 06:42:00 PM Low ED BAIL OUT \n",
"\n",
" incidentLocation location \n",
"1708 600 E FAYETTE ST (39.2906737,-76.6071600) \n",
"3788 600 N CAROLINE ST (39.2971386,-76.5975453) \n",
"9768 MORAVIA PARK ELEMENTARY (39.319454,-76.540329) \n",
"3687 300 N GAY ST (39.2940575,-76.6081374) \n",
"823 E LAFAYETTE AV/N BROADWAY (39.310655,-76.594903) "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#drop the callNumber column (axis=1)\n",
"df = df.drop(['callNumber'], axis=1)\n",
"df.tail(5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Medium 1280\n",
"Low 597\n",
"High 342\n",
"Non-Emergency 272\n",
"Emergency 1\n",
"Name: priority, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Let's take a look at the priority column\n",
"df['priority'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" time | \n",
" time24 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1615 | \n",
" 07/13/2015 | \n",
" 06:45:00 PM | \n",
" 18:45:00 | \n",
"
\n",
" \n",
" 9328 | \n",
" 07/16/2015 | \n",
" 04:03:00 AM | \n",
" 04:03:00 | \n",
"
\n",
" \n",
" 5768 | \n",
" 07/16/2015 | \n",
" 01:11:00 AM | \n",
" 01:11:00 | \n",
"
\n",
" \n",
" 9588 | \n",
" 07/16/2015 | \n",
" 08:56:00 AM | \n",
" 08:56:00 | \n",
"
\n",
" \n",
" 5275 | \n",
" 07/14/2015 | \n",
" 08:33:00 PM | \n",
" 20:33:00 | \n",
"
\n",
" \n",
" 1708 | \n",
" 07/13/2015 | \n",
" 07:13:00 PM | \n",
" 19:13:00 | \n",
"
\n",
" \n",
" 3788 | \n",
" 07/14/2015 | \n",
" 01:47:00 PM | \n",
" 13:47:00 | \n",
"
\n",
" \n",
" 9768 | \n",
" 07/16/2015 | \n",
" 10:03:00 AM | \n",
" 10:03:00 | \n",
"
\n",
" \n",
" 3687 | \n",
" 07/14/2015 | \n",
" 12:19:00 PM | \n",
" 12:19:00 | \n",
"
\n",
" \n",
" 823 | \n",
" 07/13/2015 | \n",
" 06:42:00 PM | \n",
" 18:42:00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date time time24\n",
"1615 07/13/2015 06:45:00 PM 18:45:00\n",
"9328 07/16/2015 04:03:00 AM 04:03:00\n",
"5768 07/16/2015 01:11:00 AM 01:11:00\n",
"9588 07/16/2015 08:56:00 AM 08:56:00\n",
"5275 07/14/2015 08:33:00 PM 20:33:00\n",
"1708 07/13/2015 07:13:00 PM 19:13:00\n",
"3788 07/14/2015 01:47:00 PM 13:47:00\n",
"9768 07/16/2015 10:03:00 AM 10:03:00\n",
"3687 07/14/2015 12:19:00 PM 12:19:00\n",
"823 07/13/2015 06:42:00 PM 18:42:00"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# extract multiple named regex groups from a column in a dataframe\n",
"dq = df['callDateTime'].str.extract('(?P[\\d]{2}/\\d{2}/\\d{4})\\s(?P