{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Working with Pandas I\n", "#### Based on the Chapter 5 of the book \"Python for Data Analysis\" by Wes McKinney\n", "Prepared by Dr. Kalpakis for CMSC 491/691, Fall 2017" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import random\n", "import re" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1-d Series\n", "a series is a fixed-length dictionary with keys in sorted order" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Panda indexes (axes labels) are of these types: Index, Int64Index, MultiIndex, DateTimeInddex, PeriodIndex\n", "Columns and rows are often associated with suhc indexes (labels)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[35000 71000 16000 5000]\n", "Index(['Ohio', 'Texas', 'Oregon', 'Utah'], dtype='object')\n" ] }, { "data": { "text/plain": [ "Ohio 35000\n", "Utah 5000\n", "dtype: int64" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create 1-d sequence with 4 values and given labels in index\n", "obj = pd.Series([35000, 71000, 16000, 5000], index=['Ohio', 'Texas', 'Oregon', 'Utah'])\n", "print(obj.values)\n", "print(obj.index)\n", "# get subset of given labels\n", "obj[['Ohio', 'Utah']]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Ohio 35000\n", "Oregon 16000\n", "Texas 71000\n", "Utah 5000\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create a series from a dictionary as well\n", "data = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}\n", "obj = pd.Series(data)\n", "obj" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "California NaN\n", "Ohio 35000.0\n", "Oregon 16000.0\n", "Texas 71000.0\n", "dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#if a label doesn't have an associated value in dict then set it to NaN\n", "states = ['California', 'Ohio', 'Oregon', 'Texas']\n", "obj2 = pd.Series(data, index=states)\n", "obj2" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "California NaN\n", "Ohio 70000.0\n", "Oregon 32000.0\n", "Texas 142000.0\n", "Utah NaN\n", "dtype: float64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#series objects auto-align along their index\n", "obj+obj2" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "CA NaN\n", "OH 35000.0\n", "OR 16000.0\n", "TX 71000.0\n", "dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#can update Series in place\n", "obj2.index = ['CA', 'OH', 'OR', 'TX']\n", "obj2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrames: 2-d tabular data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['year', 'state', 'pop', 'debt'], dtype='object')\n", "Index(['one', 'two', 'three', 'four', 'five'], dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7NaN
three2002Ohio3.6NaN
four2001Nevada2.4NaN
five2002Nevada2.9NaN
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 NaN\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 NaN\n", "five 2002 Nevada 2.9 NaN" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create dataframe from dictionary columns and given row and column labels\n", "# notice the NaN's in the column missing from the dictionary\n", "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],\n", " 'year': [2000, 2001, 2002, 2001, 2002],\n", " 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}\n", "rowLabels = ['one', 'two', 'three', 'four', 'five']\n", "columnLabels=['year', 'state', 'pop', 'debt']\n", "frame = pd.DataFrame(data, columns=columnLabels, index=rowLabels)\n", "print(frame.columns) # the column labels (series)\n", "print(frame.index) # the row labels (series)\n", "frame" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateyear
oneOhio2000
twoOhio2001
threeOhio2002
fourNevada2001
fiveNevada2002
\n", "
" ], "text/plain": [ " state year\n", "one Ohio 2000\n", "two Ohio 2001\n", "three Ohio 2002\n", "four Nevada 2001\n", "five Nevada 2002" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#subset by columns\n", "frame[['state', 'year']]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
three2002Ohio3.6NaN
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "three 2002 Ohio 3.6 NaN" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#subset by row labels\n", "frame.loc[['one', 'three']]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateyear
oneOhio2000
threeOhio2002
\n", "
" ], "text/plain": [ " state year\n", "one Ohio 2000\n", "three Ohio 2002" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#subset by both row and column labels\n", "frame.loc[['one', 'three']][['state', 'year']]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateyear
oneOhio2000
twoOhio2001
\n", "
" ], "text/plain": [ " state year\n", "one Ohio 2000\n", "two Ohio 2001" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# subset by row index and column labels\n", "frame[:2][['state', 'year']]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
three2002Ohio3.6NaN
four2001Nevada2.4NaN
five2002Nevada2.9NaN
\n", "
" ], "text/plain": [ " year state pop debt\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 NaN\n", "five 2002 Nevada 2.9 NaN" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#subset by value (boolean expression)\n", "frame[ frame['pop'] > 2]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5-6
two2001Ohio1.7-5
three2002Ohio3.6-10
four2001Nevada2.4-9
five2002Nevada2.9-2
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 -6\n", "two 2001 Ohio 1.7 -5\n", "three 2002 Ohio 3.6 -10\n", "four 2001 Nevada 2.4 -9\n", "five 2002 Nevada 2.9 -2" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#assign values to a column\n", "frame['debt'] = [random.randint(-10, 0) for _ in range(5)]\n", "frame" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# assign values to some rows of a column (with NaN for all the others): use a series object\n", "val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])\n", "frame['debt'] = val\n", "frame" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebttmp
one2000Ohio1.5NaNTrue
two2001Ohio1.7-1.2True
three2002Ohio3.6NaNTrue
four2001Nevada2.4-1.5False
five2002Nevada2.9-1.7False
\n", "
" ], "text/plain": [ " year state pop debt tmp\n", "one 2000 Ohio 1.5 NaN True\n", "two 2001 Ohio 1.7 -1.2 True\n", "three 2002 Ohio 3.6 NaN True\n", "four 2001 Nevada 2.4 -1.5 False\n", "five 2002 Nevada 2.9 -1.7 False" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create a new column and initialize it with an expression\n", "frame['tmp'] = frame.state == 'Ohio'\n", "frame" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#delete a column\n", "del frame['tmp']\n", "frame" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NevadaOhio
2000NaN1.5
20012.41.7
20022.93.6
\n", "
" ], "text/plain": [ " Nevada Ohio\n", "2000 NaN 1.5\n", "2001 2.4 1.7\n", "2002 2.9 3.6" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create dataframe from dict of dicts (outer dict gives the columns, inner dict the rows)\n", "pop = {'Nevada': {2001: 2.4, 2002: 2.9},\n", " 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}\n", "frame2 = pd.DataFrame(pop)\n", "frame2" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
six100100100.0100.0
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "six 100 100 100.0 100.0" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#re-index pandas by subsetting existing rows and adding new rows with fill-in value\n", "frame3 = frame.reindex(['one', 'two', 'six'], fill_value=100)\n", "frame3" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame3.drop(['six'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Apply functions to dataframe elements" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
debtpop
oneNaN4.481689
two0.3011945.473947
threeNaN36.598234
four0.22313011.023176
five0.18268418.174145
\n", "
" ], "text/plain": [ " debt pop\n", "one NaN 4.481689\n", "two 0.301194 5.473947\n", "three NaN 36.598234\n", "four 0.223130 11.023176\n", "five 0.182684 18.174145" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "# apply element-wise functions\n", "data = frame[['debt', 'pop']]\n", "np.exp(data)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "by col\n", "\n", " debt 0.5\n", "pop 2.1\n", "dtype: float64\n", "by row\n", "\n", " one 0.0\n", "two 2.9\n", "three 0.0\n", "four 3.9\n", "five 4.6\n", "dtype: float64\n" ] } ], "source": [ "f = lambda x: x.max() - x.min()\n", "print('by col\\n\\n', data.apply(f)) # aply f to each column (axis=0)\n", "print('by row\\n\\n', data.apply(f, axis=1)) # apply f to each row (along axis 1)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
debtpop
min-1.71.5
max-1.23.6
\n", "
" ], "text/plain": [ " debt pop\n", "min -1.7 1.5\n", "max -1.2 3.6" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# apply a function to each column; return a new frame\n", "def f(x): return pd.Series([x.min(), x.max()], index=['min', 'max'])\n", "ff = data.apply(f)\n", "ff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataframe sort, rank, and summarize" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
five2002Nevada2.9-1.7
four2001Nevada2.4-1.5
one2000Ohio1.5NaN
three2002Ohio3.6NaN
two2001Ohio1.7-1.2
\n", "
" ], "text/plain": [ " year state pop debt\n", "five 2002 Nevada 2.9 -1.7\n", "four 2001 Nevada 2.4 -1.5\n", "one 2000 Ohio 1.5 NaN\n", "three 2002 Ohio 3.6 NaN\n", "two 2001 Ohio 1.7 -1.2" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=0, ascending=True) # sort by rows\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
\n", "
" ], "text/plain": [ " year state pop debt\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 NaN" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_values(by=['state', 'pop']) # sort by columns" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one1.04.01.0NaN
two2.54.02.03.0
three4.54.05.0NaN
four2.51.53.02.0
five4.51.54.01.0
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 1.0 4.0 1.0 NaN\n", "two 2.5 4.0 2.0 3.0\n", "three 4.5 4.0 5.0 NaN\n", "four 2.5 1.5 3.0 2.0\n", "five 4.5 1.5 4.0 1.0" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get the rank of the values at each column\n", "frame.rank()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "year 10006\n", "state OhioOhioOhioNevadaNevada\n", "pop 12.1\n", "debt -4.4\n", "dtype: object" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# aggregate the values of each column\n", "frame.sum()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearpopdebt
count5.000005.0000003.000000
mean2001.200002.420000-1.466667
std0.836660.8642920.251661
min2000.000001.500000-1.700000
25%2001.000001.700000-1.600000
50%2001.000002.400000-1.500000
75%2002.000002.900000-1.350000
max2002.000003.600000-1.200000
\n", "
" ], "text/plain": [ " year pop debt\n", "count 5.00000 5.000000 3.000000\n", "mean 2001.20000 2.420000 -1.466667\n", "std 0.83666 0.864292 0.251661\n", "min 2000.00000 1.500000 -1.700000\n", "25% 2001.00000 1.700000 -1.600000\n", "50% 2001.00000 2.400000 -1.500000\n", "75% 2002.00000 2.900000 -1.350000\n", "max 2002.00000 3.600000 -1.200000" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#get basic statistical description for each column\n", "frame.describe()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.89197029966839891" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#compute correlation of two series (columns)\n", "frame['pop'].corr(frame['year'])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearpopdebt
year1.0000000.891970-0.802955
pop0.8919701.000000-0.999819
debt-0.802955-0.9998191.000000
\n", "
" ], "text/plain": [ " year pop debt\n", "year 1.000000 0.891970 -0.802955\n", "pop 0.891970 1.000000 -0.999819\n", "debt -0.802955 -0.999819 1.000000" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#compute the pair-wise correlations of a frame's columns\n", "frame.corr()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
two2001Ohio1.7-1.2
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
\n", "
" ], "text/plain": [ " year state pop debt\n", "two 2001 Ohio 1.7 -1.2\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop rows with NaN\n", "frame.dropna()\n", "# drop columns with NaN using axis=1" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5-99999.0
two2001Ohio1.7-1.2
three2002Ohio3.6-99999.0
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 -99999.0\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 -99999.0\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#fill missing values\n", "frame.fillna(-99999)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hierarchical indexes" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "a 1 0.877418\n", " 2 0.245937\n", " 3 0.923737\n", "b 1 0.370765\n", " 2 0.299855\n", " 3 0.257263\n", "c 1 0.173912\n", " 2 0.570717\n", "d 2 0.473231\n", " 3 0.030142\n", "dtype: float64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#index labels are now tuples - re\n", "data = pd.Series( [random.random() for _ in range(10)], \n", " index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'], \n", " [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])\n", "data" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],\n", " labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.index" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "b 1 0.370765\n", " 2 0.299855\n", " 3 0.257263\n", "c 1 0.173912\n", " 2 0.570717\n", "dtype: float64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#subsetting by outermost index\n", "data['b':'c']" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
123
a0.8774180.2459370.923737
b0.3707650.2998550.257263
c0.1739120.570717NaN
dNaN0.4732310.030142
\n", "
" ], "text/plain": [ " 1 2 3\n", "a 0.877418 0.245937 0.923737\n", "b 0.370765 0.299855 0.257263\n", "c 0.173912 0.570717 NaN\n", "d NaN 0.473231 0.030142" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# for 2-tuples for index of series can be converted to a table with unstack()\n", "data.unstack()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
colorGreenRedGreen
key1key2
a1012
2345
b1678
291011
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "color Green Red Green\n", "key1 key2 \n", "a 1 0 1 2\n", " 2 3 4 5\n", "b 1 6 7 8\n", " 2 9 10 11" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# hierarchical indexes can be used for tabular pandas as well\n", "frame = pd.DataFrame(np.arange(12).reshape((4, 3)),\n", " index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],\n", " columns=[['Ohio', 'Ohio', 'Colorado'],\n", " ['Green', 'Red', 'Green']])\n", "# name the components of the hierarchical row and column indexes\n", "frame.index.names = ['key1', 'key2']\n", "frame.columns.names = ['state', 'color']\n", "frame" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
colorGreenRedGreen
key2
16810
2121416
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "color Green Red Green\n", "key2 \n", "1 6 8 10\n", "2 12 14 16" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# aggregate by sub-index\n", "frame.sum(level='key2')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colorGreenRed
key1key2
a121
284
b1147
22010
\n", "
" ], "text/plain": [ "color Green Red\n", "key1 key2 \n", "a 1 2 1\n", " 2 8 4\n", "b 1 14 7\n", " 2 20 10" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sum(level='color', axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combine pandas: merge and concat" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})\n", "df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1key
00b
11b
22a
33c
44a
55a
66b
\n", "
" ], "text/plain": [ " data1 key\n", "0 0 b\n", "1 1 b\n", "2 2 a\n", "3 3 c\n", "4 4 a\n", "5 5 a\n", "6 6 b" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data2key
00a
11b
22d
\n", "
" ], "text/plain": [ " data2 key\n", "0 0 a\n", "1 1 b\n", "2 2 d" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00b1
11b1
26b1
32a0
44a0
55a0
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0 b 1\n", "1 1 b 1\n", "2 6 b 1\n", "3 2 a 0\n", "4 4 a 0\n", "5 5 a 0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2) # merge on index" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00b1
11b1
26b1
32a0
44a0
55a0
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0 b 1\n", "1 1 b 1\n", "2 6 b 1\n", "3 2 a 0\n", "4 4 a 0\n", "5 5 a 0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2, on='key') #merge on another column\n", "# argumnents how=One of 'inner', 'outer', 'left' or 'right'. 'inner' by default\n", "# suffix= suffix to append to column names in case of overlaps" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2key
00b0.0a
11b1.0b
22a2.0d
33cNaNNaN
44aNaNNaN
55aNaNNaN
66bNaNNaN
\n", "
" ], "text/plain": [ " data1 key data2 key\n", "0 0 b 0.0 a\n", "1 1 b 1.0 b\n", "2 2 a 2.0 d\n", "3 3 c NaN NaN\n", "4 4 a NaN NaN\n", "5 5 a NaN NaN\n", "6 6 b NaN NaN" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# concat pandas along an axis\n", "pd.concat([df1, df2], axis=1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }