Wir untersuchen Einkommensdaten (gegeben in data/wage.csv.gz) mit Hilfe von pandas, matplotlib und seaborn.
Questions:
# some imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set_style('whitegrid')
# NOTE(review): this silences every warning, including deprecation notices
# raised by the libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')
# IPython magic (only valid inside a notebook / IPython session): selects the
# inline matplotlib backend so figures are rendered in the notebook output.
%matplotlib inline
# The data file is gzip-compressed, so `cat data/wage.csv` is not an option
# for peeking at the raw text; pandas decompresses it while reading.
df = pd.read_csv('data/wage.csv.gz', index_col=0)  # first CSV column -> index

# Quick look at the table: first rows, available columns, summary statistics
# of the two numeric columns of interest, and the distinct values of 'sex'.
df.head(n=3)
df.columns
df[['age', 'wage']].describe()
df['sex'].unique()
# Figure 1: raw scatter of wage against age.
fig, ax = plt.subplots(figsize=(10, 6))
df.plot.scatter(x='age', y='wage', title='wage ~ age', ax=ax)

# Median wage per age (kept as a one-column DataFrame indexed by age).
# The built-in groupby reduction replaces `.agg(np.median)`, which recent
# pandas versions flag with a FutureWarning.
median_wage = df.groupby('age')[['wage']].median()
median_wage.head()

# Figure 2: the same scatter with the per-age median drawn on top.
fig, ax = plt.subplots(figsize=(10, 6))
df.plot.scatter(x='age', y='wage', title='wage ~ age', label='data', ax=ax)
# Plot the 'wage' column as a Series: DataFrame.plot names each line after
# its column, so label='median' would otherwise not reach the legend.
median_wage['wage'].plot.line(label='median', linewidth=4, color='r', ax=ax)
ax.legend()
# Wage distribution per education level, first overall and then split by
# race. The category orders are sorted once and shared by both figures.
education_levels = sorted(df['education'].unique())
race_groups = sorted(df['race'].unique())

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='education', y='wage', order=education_levels, ax=ax)

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df,
    x='education',
    y='wage',
    hue='race',
    order=education_levels,
    hue_order=race_groups,
    ax=ax,
)
# Same education/race boxplot as before, redrawn with the 'ticks' theme and
# with the axis spines offset and trimmed.
sns.set(style="ticks")

boxplot_options = {
    'x': 'education',
    'y': 'wage',
    'hue': 'race',
    'order': sorted(df['education'].unique()),
    'hue_order': sorted(df['race'].unique()),
}
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, ax=ax, **boxplot_options)
sns.despine(offset=10, trim=True)
# Median wage for every (education, race) combination. The aggregation is
# computed once and reused for both display and export; `.median()` replaces
# `.agg(np.median)`, which recent pandas versions flag with a FutureWarning.
df_res = df.groupby(['education', 'race'])[['wage']].median()
df_res
# Write the table, including its (education, race) index, to an Excel file.
df_res.to_excel('great_insight.xlsx')