datan

Rpy linear fit

In [1]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import r
#should work automatically i.e. no need for: df_long_r = pandas2ri.py2ri(df_long)
from rpy2.robjects import pandas2ri
pandas2ri.activate()
%load_ext rpy2.ipython

matplotlib.style.use('seaborn-notebook')
matplotlib.style.use('seaborn-ticks')

To be completed

In [2]:
import io

#df = pd.read_clipboard()
#http://lib.stat.cmu.edu/DASL/Stories/AlcoholandTobacco.html
s = """
Region	Alcohol	Tobacco
North	6.47	4.03
Yorkshire	6.13	3.76
Northeast	6.19	3.77
East Midlands	4.89	3.34
West Midlands	5.63	3.47
East Anglia	4.52	2.92
Southeast	5.89	3.20
Southwest	4.79	2.71
Wales	5.27	3.53
Scotland	6.08	4.51
Northern Ireland	4.02	4.56
"""
df = pd.read_table(io.StringIO(s))
In [3]:
plt.scatter('Tobacco', 'Alcohol', data=df, marker='o', alpha=.8,
            s=300, c='burlywood', edgecolors='crimson', lw=1.5)
plt.grid()
plt.ylabel('Alcohol', fontsize=22)
plt.xlabel('Tobacco', fontsize=22)
Out[3]:
Text(0.5,0,'Tobacco')

When using pandas you are missing Tab-Shift helpfull hints, but you gain few powerful options among which yerr and xerr, and table.

In [4]:
ax1 = plt.subplot2grid((1,3), (0,0), colspan=2)
ax2 = plt.subplot2grid((1,3), (0,2), colspan=1)
ax2.set_axis_off()

g = df.plot(ax=ax1, kind='scatter', x='Tobacco', y='Alcohol', fontsize=16,
        s=300, marker='o', c='burlywood', edgecolors='crimson', lw=1.5, 
        alpha=.6, grid=True, title='using pandas',
        yerr=.2, xerr=.11)

import pandas.plotting
tb = pandas.plotting.table(ax=ax2,data=np.round(df.describe(), 2), loc=4, colWidths=[0.4, 0.5, 0.5])

linear fit using statsmodels

In [5]:
df['ones'] = np.ones(len(df))
df
Out[5]:
Region Alcohol Tobacco ones
0 North 6.47 4.03 1.0
1 Yorkshire 6.13 3.76 1.0
2 Northeast 6.19 3.77 1.0
3 East Midlands 4.89 3.34 1.0
4 West Midlands 5.63 3.47 1.0
5 East Anglia 4.52 2.92 1.0
6 Southeast 5.89 3.20 1.0
7 Southwest 4.79 2.71 1.0
8 Wales 5.27 3.53 1.0
9 Scotland 6.08 4.51 1.0
10 Northern Ireland 4.02 4.56 1.0
In [6]:
import statsmodels.formula.api as sm

# removing the outlier
y = df.Alcohol[:-1]
x = df[['ones', 'Tobacco']][:-1]
#x = df.Tobacco[:-1]
#res = sm.OLS(y, x, hasconst=1).fit()
res = sm.OLS(y, x).fit()
res.summary()
/home/dan/.local/lib/python3.6/site-packages/scipy/stats/stats.py:1390: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  "anyway, n=%i" % int(n))
Out[6]:
OLS Regression Results
Dep. Variable: Alcohol R-squared: 0.615
Model: OLS Adj. R-squared: 0.567
Method: Least Squares F-statistic: 12.78
Date: Wed, 22 Nov 2017 Prob (F-statistic): 0.00723
Time: 12:09:15 Log-Likelihood: -4.9998
No. Observations: 10 AIC: 14.00
Df Residuals: 8 BIC: 14.60
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
ones 2.0412 1.001 2.038 0.076 -0.268 4.350
Tobacco 1.0059 0.281 3.576 0.007 0.357 1.655
Omnibus: 2.542 Durbin-Watson: 1.975
Prob(Omnibus): 0.281 Jarque-Bera (JB): 0.904
Skew: -0.014 Prob(JB): 0.636
Kurtosis: 1.527 Cond. No. 27.2
In [7]:
from statsmodels.formula.api import ols

lmout = ols('Alcohol ~ Tobacco', df[:-1]).fit()
lmout.summary()
/home/dan/.local/lib/python3.6/site-packages/scipy/stats/stats.py:1390: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  "anyway, n=%i" % int(n))
Out[7]:
OLS Regression Results
Dep. Variable: Alcohol R-squared: 0.615
Model: OLS Adj. R-squared: 0.567
Method: Least Squares F-statistic: 12.78
Date: Wed, 22 Nov 2017 Prob (F-statistic): 0.00723
Time: 12:09:15 Log-Likelihood: -4.9998
No. Observations: 10 AIC: 14.00
Df Residuals: 8 BIC: 14.60
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 2.0412 1.001 2.038 0.076 -0.268 4.350
Tobacco 1.0059 0.281 3.576 0.007 0.357 1.655
Omnibus: 2.542 Durbin-Watson: 1.975
Prob(Omnibus): 0.281 Jarque-Bera (JB): 0.904
Skew: -0.014 Prob(JB): 0.636
Kurtosis: 1.527 Cond. No. 27.2

For a detailed explanation of these results look here.

Linear fit using sklearn

In [8]:
from sklearn.linear_model import LinearRegression
In [9]:
y = np.matrix(df)[:,1]
x = np.matrix(df)[:,2]
In [10]:
clean = LinearRegression()
total = LinearRegression()
clean.fit(x[:-1], y[:-1])
total.fit(x, y)
clean.score(x[:-1], y[:-1]), total.score(x, y)
Out[10]:
(0.61510660055413524, 0.049984490275869264)
In [11]:
clean_score = '{0:.3f}'.format( clean.score( x[:-1], y[:-1] ) )
original_score = '{0:.3f}'.format( total.score( x, y ) )
In [12]:
#plt.scatter( df.Tobacco[:-1], df.Alcohol[:-1],
#         marker='o', facecolors='none', edgecolors='b', s=110,
#         label='All other regions,' +clean_score )
df[:-1].plot(kind='scatter', x='Tobacco', y='Alcohol',
             marker='o', facecolors='none', edgecolors='b', s=110,
             label='All other regions,' +clean_score,
             title='Regression of Alcohol from Tobacco')

plt.scatter( df.Tobacco[-1:], df.Alcohol[-1:],
         marker='s', color='r', s=110,
         label='N. Ireland, outlier,' +original_score )
plt.legend(loc='lower center')

test = np.linspace(2.6, 4.85, 20)
test = np.array(np.matrix(test).T)

plt.plot(test, clean.predict(test), 'k-')
plt.plot(test, total.predict(test), 'r--')
Out[12]:
[]
In [13]:
import scipy.stats
scipy.stats.linregress(df.Tobacco[:-1], df.Alcohol[:-1])
Out[13]:
LinregressResult(slope=1.0058958323387601, intercept=2.0412230868382095, rvalue=0.7842873201538676, pvalue=0.007234358351729509, stderr=0.28132158350454367)

Trellis plot: ggplot vs. seaborn

In [14]:
df0 = df[['Region', 'Alcohol', 'Tobacco']]
df_long = pd.melt(df0, id_vars='Region')
In [15]:
from ggplot import *

print( ggplot(df_long, aes(x='value')) + 
      geom_density() + facet_grid('variable')
)
/home/dan/.local/lib/python3.6/site-packages/ggplot/utils.py:81: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
/home/dan/.local/lib/python3.6/site-packages/ggplot/stats/smoothers.py:4: FutureWarning: The pandas.lib module is deprecated and will be removed in a future version. These are private functions and can be accessed from pandas._libs.lib instead
  from pandas.lib import Timestamp
/home/dan/.local/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [16]:
import seaborn as sb
g = sb.FacetGrid(df_long, row="variable", hue='variable')
g.map(sb.kdeplot, "value")
Out[16]:

and rpy2

It is probably going through active development with continuosly changing best practices. However, I found two pricipal methods for using it.

method 1: sending variables to R (globalenv)

In [17]:
robjects.globalenv['df'] = df
robjects.globalenv['df_long'] = df_long
In [18]:
'''
    %R plot(df)

    r(""" plot(df) """)
'''
Out[18]:
'\n    %R plot(df)\n\n    r(""" plot(df) """)\n'
In [19]:
%%R
lm = lm(df$Alcohol ~ df$Tobacco)
par(mfrow=c(2,2))
plot(lm)
In [20]:
%%R
library(ggplot2)
ggplot(df_long, aes(x='value')) + geom_density()

method 2: pythonic

In [21]:
gr = importr('grDevices')
stats = importr('stats')
graphics = importr('graphics')
base = importr('base')
lm = stats.lm('Alcohol ~ Tobacco', data=df)
gr
graphics.par(mfrow=base.c(2,2))
graphics.plot(lm)
#r['plot'](lm) #as a nice alternative
#input()
gr.dev_off()
Out[21]:
IntVector with 1 elements.
1
In [37]:
import rpy2.robjects.lib.ggplot2 as ggplot2
#gplot2 = importr('ggplot2')
gr
pp = ggplot2.ggplot(df) + ggplot2.geom_point(ggplot2.aes(x='Alcohol', y='Tobacco'))
#pp = ggplot2.gplot(df) + ggplot2.aes(x='Tobacco') + ggplot2.geom_density()
print(pp)
input()
gr.dev_off()
---------------------------------------------------------------------------
RRuntimeError                             Traceback (most recent call last)
 in ()
      4 pp = ggplot2.ggplot(df) + ggplot2.geom_point(ggplot2.aes(x='Alcohol', y='Tobacco'))
      5 #pp = ggplot2.gplot(df) + ggplot2.aes(x='Tobacco') + ggplot2.geom_density()
----> 6 print(pp)
      7 input()
      8 gr.dev_off()

~/.local/lib/python3.6/site-packages/rpy2/robjects/robject.py in __str__(self)
    101                 s.append(x)
    102             rpy2.rinterface.set_writeconsole_regular(f)
--> 103         self.__show(self)
    104         if sys.platform == 'win32':
    105             self.__sink()

RRuntimeError: Error in FUN(X[[i]], ...) : Unknown input:expression
In [22]:
r('lmout <- lm(df$Alcohol ~ df$Tobacco)')
Out[22]:
ListVector with 12 elements.
coefficients FloatVector with 2 elements.
4.351168 0.301938
residuals FloatVector with 11 elements.
0.902020 0.643543 0.700524 -0.469643 ... -0.379421 -0.147011 0.367090 -1.708007
effects FloatVector with 11 elements.
-18.054499 -0.564016 0.478753 -0.593801 ... -0.360566 -0.314300 -0.022665 -2.109112
... ...
call Vector with 2 elements.
RObject Vector
terms
model R/rpy2 DataFrame (11 x 2)
df$Alcohol df$Tobacco
6.470000 4.030000
6.130000 3.760000
In [23]:
 
def my_linear_fit_using_r(X,Y,verbose=True):
   # ## FITTINGS:   RPy implementation ###
   r_correlation = robjects.r('function(x,y) cor.test(x,y)')
   # r_quadfit = robjects.r('function(x,y) lm(y~I(x)+I(x^2))')
   r_linfit = robjects.r('function(x,y) lm(y~x)')
   r_get_r2=robjects.r('function(x) summary(x)$r.squared')
   lin=r_linfit(robjects.FloatVector(X),robjects.FloatVector(Y))
   coef_lin=robjects.r.coef(lin)
   a=coef_lin[0]
   b=coef_lin[1]
   r2=r_get_r2(lin)
   ci=robjects.r.confint(lin) # confidence intervals
   lwr_a=ci[0]
   lwr_b=ci[1]
   upr_a=ci[2]
   upr_b=ci[3]
   if verbose:
      print(robjects.r.summary(lin))
      # print robjects.r.summary(quad)
   return (a,b,r2[0],lwr_a,upr_a,lwr_b,upr_b)
In [36]:
linear_model = r.lm(r("y ~ x1"), data = r.data_frame(x1=x1,y=y))

rpy2.set_default_mode(rpy2.BASIC_CONVERSION)
print(linear_model.as_py()['coefficients'])
summary = r.summary(linear_model)
print(summary)
---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
~/.local/lib/python3.6/site-packages/rpy2/robjects/__init__.py in __getattribute__(self, attr)
    325         try:
--> 326             return self.__getitem__(attr)
    327         except LookupError as le:

~/.local/lib/python3.6/site-packages/rpy2/robjects/__init__.py in __getitem__(self, item)
    330     def __getitem__(self, item):
--> 331         res = _globalenv.get(item)
    332         res = conversion.ri2py(res)

LookupError: 'data_frame' not found

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
 in ()
----> 1 linear_model = r.lm(r("y ~ x1"), data = r.data_frame(x1=x1,y=y))
      2 
      3 rpy2.set_default_mode(rpy2.BASIC_CONVERSION)
      4 print(linear_model.as_py()['coefficients'])
      5 summary = r.summary(linear_model)

~/.local/lib/python3.6/site-packages/rpy2/robjects/__init__.py in __getattribute__(self, attr)
    326             return self.__getitem__(attr)
    327         except LookupError as le:
--> 328             raise AttributeError(orig_ae)
    329 
    330     def __getitem__(self, item):

AttributeError: 'R' object has no attribute 'data_frame'