In [1]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import r
#should work automatically i.e. no need for: df_long_r = pandas2ri.py2ri(df_long)
from rpy2.robjects import pandas2ri
pandas2ri.activate()
%load_ext rpy2.ipython
matplotlib.style.use('seaborn-notebook')
matplotlib.style.use('seaborn-ticks')
To be completed
In [2]:
import io
#df = pd.read_clipboard()
#http://lib.stat.cmu.edu/DASL/Stories/AlcoholandTobacco.html
s = """
Region Alcohol Tobacco
North 6.47 4.03
Yorkshire 6.13 3.76
Northeast 6.19 3.77
East Midlands 4.89 3.34
West Midlands 5.63 3.47
East Anglia 4.52 2.92
Southeast 5.89 3.20
Southwest 4.79 2.71
Wales 5.27 3.53
Scotland 6.08 4.51
Northern Ireland 4.02 4.56
"""
df = pd.read_table(io.StringIO(s))
In [3]:
plt.scatter('Tobacco', 'Alcohol', data=df, marker='o', alpha=.8,
s=300, c='burlywood', edgecolors='crimson', lw=1.5)
plt.grid()
plt.ylabel('Alcohol', fontsize=22)
plt.xlabel('Tobacco', fontsize=22)
Out[3]:
When using pandas you are missing Tab-Shift helpfull hints, but you gain few powerful options among which yerr and xerr, and table.
In [4]:
ax1 = plt.subplot2grid((1,3), (0,0), colspan=2)
ax2 = plt.subplot2grid((1,3), (0,2), colspan=1)
ax2.set_axis_off()
g = df.plot(ax=ax1, kind='scatter', x='Tobacco', y='Alcohol', fontsize=16,
s=300, marker='o', c='burlywood', edgecolors='crimson', lw=1.5,
alpha=.6, grid=True, title='using pandas',
yerr=.2, xerr=.11)
import pandas.plotting
tb = pandas.plotting.table(ax=ax2,data=np.round(df.describe(), 2), loc=4, colWidths=[0.4, 0.5, 0.5])
linear fit using statsmodels¶
In [5]:
df['ones'] = np.ones(len(df))
df
Out[5]:
In [6]:
import statsmodels.formula.api as sm
# removing the outlier
y = df.Alcohol[:-1]
x = df[['ones', 'Tobacco']][:-1]
#x = df.Tobacco[:-1]
#res = sm.OLS(y, x, hasconst=1).fit()
res = sm.OLS(y, x).fit()
res.summary()
Out[6]:
In [7]:
from statsmodels.formula.api import ols
lmout = ols('Alcohol ~ Tobacco', df[:-1]).fit()
lmout.summary()
Out[7]:
For a detailed explanation of these results look here.
Linear fit using sklearn¶
In [8]:
from sklearn.linear_model import LinearRegression
In [9]:
y = np.matrix(df)[:,1]
x = np.matrix(df)[:,2]
In [10]:
clean = LinearRegression()
total = LinearRegression()
clean.fit(x[:-1], y[:-1])
total.fit(x, y)
clean.score(x[:-1], y[:-1]), total.score(x, y)
Out[10]:
In [11]:
clean_score = '{0:.3f}'.format( clean.score( x[:-1], y[:-1] ) )
original_score = '{0:.3f}'.format( total.score( x, y ) )
In [12]:
#plt.scatter( df.Tobacco[:-1], df.Alcohol[:-1],
# marker='o', facecolors='none', edgecolors='b', s=110,
# label='All other regions,' +clean_score )
df[:-1].plot(kind='scatter', x='Tobacco', y='Alcohol',
marker='o', facecolors='none', edgecolors='b', s=110,
label='All other regions,' +clean_score,
title='Regression of Alcohol from Tobacco')
plt.scatter( df.Tobacco[-1:], df.Alcohol[-1:],
marker='s', color='r', s=110,
label='N. Ireland, outlier,' +original_score )
plt.legend(loc='lower center')
test = np.linspace(2.6, 4.85, 20)
test = np.array(np.matrix(test).T)
plt.plot(test, clean.predict(test), 'k-')
plt.plot(test, total.predict(test), 'r--')
Out[12]:
In [13]:
import scipy.stats
scipy.stats.linregress(df.Tobacco[:-1], df.Alcohol[:-1])
Out[13]:
Trellis plot: ggplot vs. seaborn¶
In [14]:
df0 = df[['Region', 'Alcohol', 'Tobacco']]
df_long = pd.melt(df0, id_vars='Region')
In [15]:
from ggplot import *
print( ggplot(df_long, aes(x='value')) +
geom_density() + facet_grid('variable')
)
In [16]:
import seaborn as sb
g = sb.FacetGrid(df_long, row="variable", hue='variable')
g.map(sb.kdeplot, "value")
Out[16]:
In [17]:
robjects.globalenv['df'] = df
robjects.globalenv['df_long'] = df_long
In [18]:
'''
%R plot(df)
r(""" plot(df) """)
'''
Out[18]:
In [19]:
%%R
lm = lm(df$Alcohol ~ df$Tobacco)
par(mfrow=c(2,2))
plot(lm)
In [20]:
%%R
library(ggplot2)
ggplot(df_long, aes(x='value')) + geom_density()
method 2: pythonic¶
In [21]:
gr = importr('grDevices')
stats = importr('stats')
graphics = importr('graphics')
base = importr('base')
lm = stats.lm('Alcohol ~ Tobacco', data=df)
gr
graphics.par(mfrow=base.c(2,2))
graphics.plot(lm)
#r['plot'](lm) #as a nice alternative
#input()
gr.dev_off()
Out[21]:
In [37]:
import rpy2.robjects.lib.ggplot2 as ggplot2
#gplot2 = importr('ggplot2')
gr
pp = ggplot2.ggplot(df) + ggplot2.geom_point(ggplot2.aes(x='Alcohol', y='Tobacco'))
#pp = ggplot2.gplot(df) + ggplot2.aes(x='Tobacco') + ggplot2.geom_density()
print(pp)
input()
gr.dev_off()
In [22]:
r('lmout <- lm(df$Alcohol ~ df$Tobacco)')
Out[22]:
In [23]:
def my_linear_fit_using_r(X,Y,verbose=True):
# ## FITTINGS: RPy implementation ###
r_correlation = robjects.r('function(x,y) cor.test(x,y)')
# r_quadfit = robjects.r('function(x,y) lm(y~I(x)+I(x^2))')
r_linfit = robjects.r('function(x,y) lm(y~x)')
r_get_r2=robjects.r('function(x) summary(x)$r.squared')
lin=r_linfit(robjects.FloatVector(X),robjects.FloatVector(Y))
coef_lin=robjects.r.coef(lin)
a=coef_lin[0]
b=coef_lin[1]
r2=r_get_r2(lin)
ci=robjects.r.confint(lin) # confidence intervals
lwr_a=ci[0]
lwr_b=ci[1]
upr_a=ci[2]
upr_b=ci[3]
if verbose:
print(robjects.r.summary(lin))
# print robjects.r.summary(quad)
return (a,b,r2[0],lwr_a,upr_a,lwr_b,upr_b)
In [36]:
linear_model = r.lm(r("y ~ x1"), data = r.data_frame(x1=x1,y=y))
rpy2.set_default_mode(rpy2.BASIC_CONVERSION)
print(linear_model.as_py()['coefficients'])
summary = r.summary(linear_model)
print(summary)