Working with Data

TODO: Figure out the primitive list structure of this document

Data Science

Tools

  • Python
  • Pandas - working with table data
  • Numpy - working with matrix structures
  • Plotly
    • pip install plotly
    • conda install -c plotly plotly
  • RFPImp
    • pip install rfpimp
    • conda install -c conda-forge rfpimp
  • Tensorflow
    • pip install tensorflow
    • conda install -c conda-forge tensorflow

Info: For zsh, one needs to include this in .zshrc

source ~/.bash_profile

Pandas

import pandas as pd

Reading and writing

Data Formats

  • CSV - comma-separated values
  • XLSX - Excel (based on XML)
  • JSON - JavaScript Object Notation
  • HTML

Pandas IO

CSV Structure

ID, Name, Age, City
0, Tim, 25, Vienna
1, Angelica, 29, Bern
2, Alex, 31, Milano

Separators

  • , – typically comma-separated values
  • ; – Excel output
  • \t
  • #

Read a CSV file (into a pd.DataFrame)

import pandas as pd
df = pd.read_csv('example.csv', sep=';')
df

Read a XLSX file (into a pd.DataFrame)

import pandas as pd
df = pd.read_excel('example.xlsx', sheet_name='Sheet1')
df

Read a JSON file (into a pd.DataFrame)

import pandas as pd
df = pd.read_json('example.json')
df

DataFrame

Obtain (rows, columns)

df.shape

Show first 5 rows of data

df.head()

Get column names

df.columns

Drop any duplicates (overwrite df)

df = df.drop_duplicates()

Drop NaN values

df = df.dropna()

Sum not-a-number values

df.Price.isna().sum()

Sum "null" (no-value) values

df.isnull().sum()

Check sum (to check whether data was transferred completely, without corruption)

df.numberDataColumn.sum()

Check occurrences of values

df.jobs.unique()        # list of unique values
df.jobs.nunique()       # number of unique values
df.jobs.count()         # count values
df.jobs.value_counts()  # list & number of values
df.income.value_counts()/df.income.shape[0] # relative number of values

Univariate analysis of data in column

df.age.describe()  # statistics like mean, min/max etc.
df.age.median()    # median
df.age.mean()      # mean
df.age.min()       # min
df.age.max()       # max

Impute missing values with most frequent value

categorical_columns_missing = ["Gender", "Married"]

from sklearn.impute import SimpleImputer

impute_missing = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

df[categorical_columns_missing] = impute_missing.fit_transform(df[categorical_columns_missing])

with mean value

impute_missing = SimpleImputer(missing_values=np.nan, strategy='mean')

df["Amount"] = impute_missing.fit_transform(df[["Amount"]])

Queries in Pandas

df[df.job == "engineer"]

Query for data in a column

developers = df[df.job.isin(["developer", "programmer"])].index
wealthy    = df[df.income > 50000].occupation
wealthySalesperson = df[(df.occupation == "Salesperson") & (df.income > 50000)].shape[0]

Delete column(s)

df = df.drop(labels=0, axis=0)          # axis=0 deletes row (by index label)
df = df.drop(['Col1', 'Col2'], axis=1)  # axis=1 deletes column

Create new column

df["capital"] = df.capital_gain - df.capital_loss

Multiply the whole DataFrame by factor and round (e.g. fraction to percent)

df *= 100
df = df.astype(float).round(1)

Rename last column

df.columns = list(df.columns.values[:-1]) + ["newName"]

Replace entries

Example: Save lines to variable; Use lines to rewrite the values in the "job" column

engineers = df[df.job == "engineer"].index
df.loc[engineers, "job"] = "Engineer"

Replace

df.replace(to_replace = ["A", "B"], value ="AB") 
df.replace(to_replace = np.nan, value = -1)

Replacing commas

df.apply(lambda x: x.str.replace(',','.'))

Sort

df.sort_values(by="price")
df.sort_values(by=["AveragePrice"])

Group by

Group by values in column

df.groupby("name") 
df.groupby(["name", "age"]) 

returns groupby object (not a DataFrame!) that accepts manipulations such as mean() etc.

df.groupby(["occupation"]).salary.mean()
df.groupby(["occupation"]).salary.min()
df.groupby(["firstCategory", "secondCategory"]).count()["id"]

Reset index (to get back a DataFrame)

df.groupby(["occupation"]).salary.mean().reset_index()

Get one of the groups

df_group = df.get_group("GroupA")

Creating a DataFrame

Create DataFrame

df = pd.DataFrame({
                    'Year'  : year_list,
                    'Month' : month_list,
                    'Day'   : day_list
                  })

Create DataFrame with specified rows and columns

df = pd.DataFrame(index=['row1', 'row2', ...], columns=['column 1', 'column 2'])

for row_key in df.index:
  df[row_key] = [column_1[row_key], column_2[row_key]]

Loop through rows in DataFrame

for i, row in df.iterrows():
    print(df['Column1'][i])
    print(row['Column1'])

DataFrame from dictionary

data = {'name': [name], 'money': [money]}
df = pd.DataFrame(data, columns=["name", "money"])

Split a DataFrame

df_first500 = df.iloc[:500, :]
df_501up    = df.iloc[500:, :]

Pandas and NumPy

import pandas as pd
import numpy as np

create data frame

data = {
    'name': ['Eric', 'Lisa', 'John'],
    'age': [38,25,44],
    'salary': [13000, 2000, 8000]
}
df = pd.DataFrame(data)

data frame --> numpy array

np_array = df.to_numpy()

single column --> numpy array

np_array = df['age'].to_numpy()

multiple columns --> numpy array

np_array = df[['age', 'salary']].to_numpy()

integer valued columns --> numpy array

df.select_dtypes(include='int64').to_numpy()

Numpy

import numpy as np

Mean

np.mean(list_of_numbers)

Matplotlib

import matplotlib.pyplot as plt

Simple Pandas Visualisation

Histograms (1D, univariate)

df.age.hist()

Count Plot (1D, univariate)

df.workclass.value_counts().plot(kind="bar")

Scatter Plot (2D, bivariate)

df.plot.scatter("age", "capital")

Linear Correlation (2D, bivariate)

df.corr()

Plot

fig, ax = plt.subplots()
ax.plot(x, x, label='linear')
ax.plot(x, x**2, label='quadratic')
ax.plot(x, x**3, label='cubic')
ax.set_xlabel('x label')
ax.set_ylabel('y label')
ax.set_title("Simple Plot")
ax.legend()

Plotly (Express)

import plotly.express as px

Plotly categories

Plot categories

Histogram

px.histogram(data_frame=df, x="xColumn", title="My Title")

Stacked histogram (with more than one category)

px.histogram(df, x="xColumn", color="typeColumn", title="My Title")

Box Plot

px.box(df, y="columnName")

Stacked box plot (with more than one category)

px.box(df, y="dataColumn", x="categoryColumn")

Line Plot

px.line(df, x="date", y="price")

Stacked line plot

px.line(df, x="date", y="price", color="type")

Bar Plot

px.bar(df, x="price", y="region", orientation="h") # "h" = horizontal bars
px.bar(df, x="region", y="price", orientation="v") # "v" = vertical bars

Stack bar plot

px.bar(df, x="price", y="region", color="type")

Stack side-by-side bar plot

px.bar(df, x="price", y="region", color="type", barmode="group")

Scatterplot

px.scatter(df, x="price", y="amount")

Enhancing plot styles

fig = px.bar(df_profit_per_city, 
            y="city", 
            x="profit", 
            orientation="h", 
            color="city",
            barmode='group',
            title="Plot Title",
            color_discrete_sequence=["red", "blue"]
      )

fig.update_layout(
    xaxis_title="profit (in €)",
    showlegend=False
    )

fig.show()



Geopandas

Resources

import numpy as np
import matplotlib.pyplot as plt  # plotting
import pandas as pd              # DataFrames, Excel I/O
from pandas import Series        # numbered (indexed) columns
import geopandas as gpd

Python Mapping with GeoPandas

Data that comes with GeoPandas

world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

Note: gpd.datasets was removed in GeoPandas 1.0 — download the Natural Earth
low-res shapefile manually and pass its path to gpd.read_file instead.

Data structure

world.head()

Coordinate reference system (CRS)

world.crs
world.plot(color='grey', linewidth=0.5, edgecolor='white', figsize=(15,10))

Custom data (shape file)

import os
data_path = "../Data/"

cities = gpd.read_file(os.path.join(data_path, "ne_10m_populated_places.shp"))

cities.head()

cities.crs

cities.plot(figsize=(15,10), color='orange', markersize=5)

Join two geographical data sets

world.crs == cities.crs

base = world.plot(color='grey', linewidth=0.5, edgecolor='white', figsize=(15,10))
cities.plot(ax=base, color='orange', markersize=5)
base.set_axis_off()