{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Working with Pandas II\n", "Prepared by Dr. Kalpakis for CMSC 491/691, Fall 2017" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import re\n", "import random" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading a CSV using *pandas*" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV into a dataframe. The CSV file is located at the same directory as the notebook file \n", "#frame = pd.read_csv(\"911_Calls_for_Service.csv\", nrows=10000)\n", "#frame.to_csv(\"911_Calls_for_Service_small.csv\")\n", "frame = pd.read_csv(\"911_Calls_for_Service_small.csv\")\n", "\n", "# make random sample of dataframe\n", "frame = frame.sample(frac=0.25, replace=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Peek at the head and tail of the data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0callDateTimeprioritydistrictdescriptioncallNumberincidentLocationlocation
5606560607/15/2015 12:54:00 AMMediumSWDISORDERLYP151960095W NORTH AV/N ROSEDALE ST(39.308957,-76.669325)
3869386907/14/2015 01:21:00 PMHighSEAUTO ACC/INJURYP1519514422700 BOSTON ST(39.2796259,-76.5785027)
4010401007/14/2015 09:22:00 PMMediumSDJUV DISTURBANCEP151953234BAY-BROOK ELEM MIDDLE(39.226381,-76.599821)
8165816507/15/2015 06:57:00 PMMediumSEDISORDERLYP1519625763400 BLK HUDSON ST(39.2824920,-76.5687395)
4630463007/14/2015 05:20:00 PMMediumCD911/NO VOICEP1519522931000 CATHEDRAL ST(39.3011812,-76.6180370)
\n", "
" ], "text/plain": [ " Unnamed: 0 callDateTime priority district description \\\n", "5606 5606 07/15/2015 12:54:00 AM Medium SW DISORDERLY \n", "3869 3869 07/14/2015 01:21:00 PM High SE AUTO ACC/INJURY \n", "4010 4010 07/14/2015 09:22:00 PM Medium SD JUV DISTURBANCE \n", "8165 8165 07/15/2015 06:57:00 PM Medium SE DISORDERLY \n", "4630 4630 07/14/2015 05:20:00 PM Medium CD 911/NO VOICE \n", "\n", " callNumber incidentLocation location \n", "5606 P151960095 W NORTH AV/N ROSEDALE ST (39.308957,-76.669325) \n", "3869 P151951442 2700 BOSTON ST (39.2796259,-76.5785027) \n", "4010 P151953234 BAY-BROOK ELEM MIDDLE (39.226381,-76.599821) \n", "8165 P151962576 3400 BLK HUDSON ST (39.2824920,-76.5687395) \n", "4630 P151952293 1000 CATHEDRAL ST (39.3011812,-76.6180370) " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show the first 5 and last 5 rows of the DF\n", "frame.head(5)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0callDateTimeprioritydistrictdescriptioncallNumberincidentLocationlocation
1708170807/13/2015 07:13:00 PMMediumCD911/NO VOICEP151942795600 E FAYETTE ST(39.2906737,-76.6071600)
3788378807/14/2015 01:47:00 PMMediumED911/NO VOICEP151951521600 N CAROLINE ST(39.2971386,-76.5975453)
9768976807/16/2015 10:03:00 AMLowNEDOG BITEP151970829MORAVIA PARK ELEMENTARY(39.319454,-76.540329)
3687368707/14/2015 12:19:00 PMLowEDTransportP151951222300 N GAY ST(39.2940575,-76.6081374)
82382307/13/2015 06:42:00 PMLowEDBAIL OUTP151942693E LAFAYETTE AV/N BROADWAY(39.310655,-76.594903)
\n", "
" ], "text/plain": [ " Unnamed: 0 callDateTime priority district description \\\n", "1708 1708 07/13/2015 07:13:00 PM Medium CD 911/NO VOICE \n", "3788 3788 07/14/2015 01:47:00 PM Medium ED 911/NO VOICE \n", "9768 9768 07/16/2015 10:03:00 AM Low NE DOG BITE \n", "3687 3687 07/14/2015 12:19:00 PM Low ED Transport \n", "823 823 07/13/2015 06:42:00 PM Low ED BAIL OUT \n", "\n", " callNumber incidentLocation location \n", "1708 P151942795 600 E FAYETTE ST (39.2906737,-76.6071600) \n", "3788 P151951521 600 N CAROLINE ST (39.2971386,-76.5975453) \n", "9768 P151970829 MORAVIA PARK ELEMENTARY (39.319454,-76.540329) \n", "3687 P151951222 300 N GAY ST (39.2940575,-76.6081374) \n", "823 P151942693 E LAFAYETTE AV/N BROADWAY (39.310655,-76.594903) " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.tail(5)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(#rows,#cols)= (2500, 8)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0
count2500.000000
mean4985.444800
std2899.193284
min4.000000
25%2430.750000
50%4943.000000
75%7490.000000
max9999.000000
\n", "
" ], "text/plain": [ " Unnamed: 0\n", "count 2500.000000\n", "mean 4985.444800\n", "std 2899.193284\n", "min 4.000000\n", "25% 2430.750000\n", "50% 4943.000000\n", "75% 7490.000000\n", "max 9999.000000" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get the shape (#rows, #cols) of the dataframe\n", "print('(#rows,#cols)=', frame.shape)\n", "\n", "#compute some basic statictics of tha data in each column\n", "frame.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Preprocessing\n", "Create a fresh copy of the data frame for editing keeping the original intact." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#make a copy of the dataframe\n", "df = frame.copy()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0callDateTimeprioritydistrictdescriptionincidentLocationlocation
1708170807/13/2015 07:13:00 PMMediumCD911/NO VOICE600 E FAYETTE ST(39.2906737,-76.6071600)
3788378807/14/2015 01:47:00 PMMediumED911/NO VOICE600 N CAROLINE ST(39.2971386,-76.5975453)
9768976807/16/2015 10:03:00 AMLowNEDOG BITEMORAVIA PARK ELEMENTARY(39.319454,-76.540329)
3687368707/14/2015 12:19:00 PMLowEDTransport300 N GAY ST(39.2940575,-76.6081374)
82382307/13/2015 06:42:00 PMLowEDBAIL OUTE LAFAYETTE AV/N BROADWAY(39.310655,-76.594903)
\n", "
" ], "text/plain": [ " Unnamed: 0 callDateTime priority district description \\\n", "1708 1708 07/13/2015 07:13:00 PM Medium CD 911/NO VOICE \n", "3788 3788 07/14/2015 01:47:00 PM Medium ED 911/NO VOICE \n", "9768 9768 07/16/2015 10:03:00 AM Low NE DOG BITE \n", "3687 3687 07/14/2015 12:19:00 PM Low ED Transport \n", "823 823 07/13/2015 06:42:00 PM Low ED BAIL OUT \n", "\n", " incidentLocation location \n", "1708 600 E FAYETTE ST (39.2906737,-76.6071600) \n", "3788 600 N CAROLINE ST (39.2971386,-76.5975453) \n", "9768 MORAVIA PARK ELEMENTARY (39.319454,-76.540329) \n", "3687 300 N GAY ST (39.2940575,-76.6081374) \n", "823 E LAFAYETTE AV/N BROADWAY (39.310655,-76.594903) " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#drop the callNumber column (axis=1)\n", "df = df.drop(['callNumber'], axis=1)\n", "df.tail(5)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Medium 1280\n", "Low 597\n", "High 342\n", "Non-Emergency 272\n", "Emergency 1\n", "Name: priority, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Let's take a look at the priority column\n", "df['priority'].value_counts()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetimetime24
161507/13/201506:45:00 PM18:45:00
932807/16/201504:03:00 AM04:03:00
576807/16/201501:11:00 AM01:11:00
958807/16/201508:56:00 AM08:56:00
527507/14/201508:33:00 PM20:33:00
170807/13/201507:13:00 PM19:13:00
378807/14/201501:47:00 PM13:47:00
976807/16/201510:03:00 AM10:03:00
368707/14/201512:19:00 PM12:19:00
82307/13/201506:42:00 PM18:42:00
\n", "
" ], "text/plain": [ " date time time24\n", "1615 07/13/2015 06:45:00 PM 18:45:00\n", "9328 07/16/2015 04:03:00 AM 04:03:00\n", "5768 07/16/2015 01:11:00 AM 01:11:00\n", "9588 07/16/2015 08:56:00 AM 08:56:00\n", "5275 07/14/2015 08:33:00 PM 20:33:00\n", "1708 07/13/2015 07:13:00 PM 19:13:00\n", "3788 07/14/2015 01:47:00 PM 13:47:00\n", "9768 07/16/2015 10:03:00 AM 10:03:00\n", "3687 07/14/2015 12:19:00 PM 12:19:00\n", "823 07/13/2015 06:42:00 PM 18:42:00" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# extract multiple named regex groups from a column in a dataframe\n", "dq = df['callDateTime'].str.extract('(?P[\\d]{2}/\\d{2}/\\d{4})\\s(?P