In [1]:

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import r
#should work automatically i.e. no need for: df_long_r = pandas2ri.py2ri(df_long)
from rpy2.robjects import pandas2ri
pandas2ri.activate()
%load_ext rpy2.ipython

matplotlib.style.use('seaborn-notebook')
matplotlib.style.use('seaborn-ticks')

To be completed

In [2]:

import io

#df = pd.read_clipboard()
#http://lib.stat.cmu.edu/DASL/Stories/AlcoholandTobacco.html
s = """
Region	Alcohol	Tobacco
North	6.47	4.03
Yorkshire	6.13	3.76
Northeast	6.19	3.77
East Midlands	4.89	3.34
West Midlands	5.63	3.47
East Anglia	4.52	2.92
Southeast	5.89	3.20
Southwest	4.79	2.71
Wales	5.27	3.53
Scotland	6.08	4.51
Northern Ireland	4.02	4.56
"""
df = pd.read_table(io.StringIO(s))

In [3]:

plt.scatter('Tobacco', 'Alcohol', data=df, marker='o', alpha=.8,
            s=300, c='burlywood', edgecolors='crimson', lw=1.5)
plt.grid()
plt.ylabel('Alcohol', fontsize=22)
plt.xlabel('Tobacco', fontsize=22)

Out[3]:

Text(0.5,0,'Tobacco')

When using pandas you are missing Tab-Shift helpfull hints, but you gain few powerful options among which yerr and xerr, and table.

In [4]:

ax1 = plt.subplot2grid((1,3), (0,0), colspan=2)
ax2 = plt.subplot2grid((1,3), (0,2), colspan=1)
ax2.set_axis_off()

g = df.plot(ax=ax1, kind='scatter', x='Tobacco', y='Alcohol', fontsize=16,
        s=300, marker='o', c='burlywood', edgecolors='crimson', lw=1.5, 
        alpha=.6, grid=True, title='using pandas',
        yerr=.2, xerr=.11)

import pandas.plotting
tb = pandas.plotting.table(ax=ax2,data=np.round(df.describe(), 2), loc=4, colWidths=[0.4, 0.5, 0.5])

linear fit using statsmodels¶

In [5]:

df['ones'] = np.ones(len(df))
df

Out[5]:

	Region	Alcohol	Tobacco	ones
0	North	6.47	4.03	1.0
1	Yorkshire	6.13	3.76	1.0
2	Northeast	6.19	3.77	1.0
3	East Midlands	4.89	3.34	1.0
4	West Midlands	5.63	3.47	1.0
5	East Anglia	4.52	2.92	1.0
6	Southeast	5.89	3.20	1.0
7	Southwest	4.79	2.71	1.0
8	Wales	5.27	3.53	1.0
9	Scotland	6.08	4.51	1.0
10	Northern Ireland	4.02	4.56	1.0

In [6]:

import statsmodels.formula.api as sm

# removing the outlier
y = df.Alcohol[:-1]
x = df[['ones', 'Tobacco']][:-1]
#x = df.Tobacco[:-1]
#res = sm.OLS(y, x, hasconst=1).fit()
res = sm.OLS(y, x).fit()
res.summary()

/home/dan/.local/lib/python3.6/site-packages/scipy/stats/stats.py:1390: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  "anyway, n=%i" % int(n))

Out[6]:

OLS Regression Results
Dep. Variable:	Alcohol	R-squared:	0.615
Model:	OLS	Adj. R-squared:	0.567
Method:	Least Squares	F-statistic:	12.78
Date:	Wed, 22 Nov 2017	Prob (F-statistic):	0.00723
Time:	12:09:15	Log-Likelihood:	-4.9998
No. Observations:	10	AIC:	14.00
Df Residuals:	8	BIC:	14.60
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
ones	2.0412	1.001	2.038	0.076	-0.268	4.350
Tobacco	1.0059	0.281	3.576	0.007	0.357	1.655

Omnibus:	2.542	Durbin-Watson:	1.975
Prob(Omnibus):	0.281	Jarque-Bera (JB):	0.904
Skew:	-0.014	Prob(JB):	0.636
Kurtosis:	1.527	Cond. No.	27.2

In [7]:

from statsmodels.formula.api import ols

lmout = ols('Alcohol ~ Tobacco', df[:-1]).fit()
lmout.summary()

/home/dan/.local/lib/python3.6/site-packages/scipy/stats/stats.py:1390: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  "anyway, n=%i" % int(n))

Out[7]:

OLS Regression Results
Dep. Variable:	Alcohol	R-squared:	0.615
Model:	OLS	Adj. R-squared:	0.567
Method:	Least Squares	F-statistic:	12.78
Date:	Wed, 22 Nov 2017	Prob (F-statistic):	0.00723
Time:	12:09:15	Log-Likelihood:	-4.9998
No. Observations:	10	AIC:	14.00
Df Residuals:	8	BIC:	14.60
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	2.0412	1.001	2.038	0.076	-0.268	4.350
Tobacco	1.0059	0.281	3.576	0.007	0.357	1.655

Omnibus:	2.542	Durbin-Watson:	1.975
Prob(Omnibus):	0.281	Jarque-Bera (JB):	0.904
Skew:	-0.014	Prob(JB):	0.636
Kurtosis:	1.527	Cond. No.	27.2

For a detailed explanation of these results look here.

Linear fit using sklearn¶

In [8]:

from sklearn.linear_model import LinearRegression

In [9]:

y = np.matrix(df)[:,1]
x = np.matrix(df)[:,2]

In [10]:

clean = LinearRegression()
total = LinearRegression()
clean.fit(x[:-1], y[:-1])
total.fit(x, y)
clean.score(x[:-1], y[:-1]), total.score(x, y)

Out[10]:

(0.61510660055413524, 0.049984490275869264)

In [11]:

clean_score = '{0:.3f}'.format( clean.score( x[:-1], y[:-1] ) )
original_score = '{0:.3f}'.format( total.score( x, y ) )

In [12]:

#plt.scatter( df.Tobacco[:-1], df.Alcohol[:-1],
#         marker='o', facecolors='none', edgecolors='b', s=110,
#         label='All other regions,' +clean_score )
df[:-1].plot(kind='scatter', x='Tobacco', y='Alcohol',
             marker='o', facecolors='none', edgecolors='b', s=110,
             label='All other regions,' +clean_score,
             title='Regression of Alcohol from Tobacco')

plt.scatter( df.Tobacco[-1:], df.Alcohol[-1:],
         marker='s', color='r', s=110,
         label='N. Ireland, outlier,' +original_score )
plt.legend(loc='lower center')

test = np.linspace(2.6, 4.85, 20)
test = np.array(np.matrix(test).T)

plt.plot(test, clean.predict(test), 'k-')
plt.plot(test, total.predict(test), 'r--')

Out[12]:

[]

In [13]:

import scipy.stats
scipy.stats.linregress(df.Tobacco[:-1], df.Alcohol[:-1])

Out[13]:

LinregressResult(slope=1.0058958323387601, intercept=2.0412230868382095, rvalue=0.7842873201538676, pvalue=0.007234358351729509, stderr=0.28132158350454367)

Trellis plot: ggplot vs. seaborn¶

In [14]:

df0 = df[['Region', 'Alcohol', 'Tobacco']]
df_long = pd.melt(df0, id_vars='Region')

In [15]:

from ggplot import *

print( ggplot(df_long, aes(x='value')) + 
      geom_density() + facet_grid('variable')
)

/home/dan/.local/lib/python3.6/site-packages/ggplot/utils.py:81: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
/home/dan/.local/lib/python3.6/site-packages/ggplot/stats/smoothers.py:4: FutureWarning: The pandas.lib module is deprecated and will be removed in a future version. These are private functions and can be accessed from pandas._libs.lib instead
  from pandas.lib import Timestamp
/home/dan/.local/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [16]:

import seaborn as sb
g = sb.FacetGrid(df_long, row="variable", hue='variable')
g.map(sb.kdeplot, "value")

Out[16]:

and rpy2¶

It is probably going through active development with continuosly changing best practices. However, I found two pricipal methods for using it.

method 1: sending variables to R (globalenv)¶

In [17]:

robjects.globalenv['df'] = df
robjects.globalenv['df_long'] = df_long

In [18]:

'''
    %R plot(df)

    r(""" plot(df) """)
'''

Out[18]:

'\n    %R plot(df)\n\n    r(""" plot(df) """)\n'

In [19]:

%%R
lm = lm(df$Alcohol ~ df$Tobacco)
par(mfrow=c(2,2))
plot(lm)

In [20]:

%%R
library(ggplot2)
ggplot(df_long, aes(x='value')) + geom_density()

method 2: pythonic¶

In [21]:

gr = importr('grDevices')
stats = importr('stats')
graphics = importr('graphics')
base = importr('base')
lm = stats.lm('Alcohol ~ Tobacco', data=df)
gr
graphics.par(mfrow=base.c(2,2))
graphics.plot(lm)
#r['plot'](lm) #as a nice alternative
#input()
gr.dev_off()

Out[21]:

IntVector with 1 elements.

1

In [37]:

import rpy2.robjects.lib.ggplot2 as ggplot2
#gplot2 = importr('ggplot2')
gr
pp = ggplot2.ggplot(df) + ggplot2.geom_point(ggplot2.aes(x='Alcohol', y='Tobacco'))
#pp = ggplot2.gplot(df) + ggplot2.aes(x='Tobacco') + ggplot2.geom_density()
print(pp)
input()
gr.dev_off()

---------------------------------------------------------------------------
RRuntimeError                             Traceback (most recent call last)
 in ()
      4 pp = ggplot2.ggplot(df) + ggplot2.geom_point(ggplot2.aes(x='Alcohol', y='Tobacco'))
      5 #pp = ggplot2.gplot(df) + ggplot2.aes(x='Tobacco') + ggplot2.geom_density()
----> 6 print(pp)
      7 input()
      8 gr.dev_off()

~/.local/lib/python3.6/site-packages/rpy2/robjects/robject.py in __str__(self)
    101                 s.append(x)
    102             rpy2.rinterface.set_writeconsole_regular(f)
--> 103         self.__show(self)
    104         if sys.platform == 'win32':
    105             self.__sink()

RRuntimeError: Error in FUN(X[[i]], ...) : Unknown input:expression

In [22]:

r('lmout <- lm(df$Alcohol ~ df$Tobacco)')

Out[22]:

ListVector with 12 elements.

coefficients

FloatVector with 2 elements.

4.351168

0.301938

residuals

FloatVector with 11 elements.

0.902020

0.643543

0.700524

-0.469643

...

-0.379421

-0.147011

0.367090

-1.708007

effects

FloatVector with 11 elements.

-18.054499

-0.564016

0.478753

-0.593801

...

-0.360566

-0.314300

-0.022665

-2.109112

...

call

Vector with 2 elements.

RObject

Vector

terms

model

R/rpy2 DataFrame (11 x 2)

df$Alcohol	df$Tobacco
6.470000	4.030000
6.130000	3.760000

In [23]:

 
def my_linear_fit_using_r(X,Y,verbose=True):
   # ## FITTINGS:   RPy implementation ###
   r_correlation = robjects.r('function(x,y) cor.test(x,y)')
   # r_quadfit = robjects.r('function(x,y) lm(y~I(x)+I(x^2))')
   r_linfit = robjects.r('function(x,y) lm(y~x)')
   r_get_r2=robjects.r('function(x) summary(x)$r.squared')
   lin=r_linfit(robjects.FloatVector(X),robjects.FloatVector(Y))
   coef_lin=robjects.r.coef(lin)
   a=coef_lin[0]
   b=coef_lin[1]
   r2=r_get_r2(lin)
   ci=robjects.r.confint(lin) # confidence intervals
   lwr_a=ci[0]
   lwr_b=ci[1]
   upr_a=ci[2]
   upr_b=ci[3]
   if verbose:
      print(robjects.r.summary(lin))
      # print robjects.r.summary(quad)
   return (a,b,r2[0],lwr_a,upr_a,lwr_b,upr_b)

In [36]:

linear_model = r.lm(r("y ~ x1"), data = r.data_frame(x1=x1,y=y))

rpy2.set_default_mode(rpy2.BASIC_CONVERSION)
print(linear_model.as_py()['coefficients'])
summary = r.summary(linear_model)
print(summary)

---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
~/.local/lib/python3.6/site-packages/rpy2/robjects/__init__.py in __getattribute__(self, attr)
    325         try:
--> 326             return self.__getitem__(attr)
    327         except LookupError as le:

~/.local/lib/python3.6/site-packages/rpy2/robjects/__init__.py in __getitem__(self, item)
    330     def __getitem__(self, item):
--> 331         res = _globalenv.get(item)
    332         res = conversion.ri2py(res)

LookupError: 'data_frame' not found

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
 in ()
----> 1 linear_model = r.lm(r("y ~ x1"), data = r.data_frame(x1=x1,y=y))
      2 
      3 rpy2.set_default_mode(rpy2.BASIC_CONVERSION)
      4 print(linear_model.as_py()['coefficients'])
      5 summary = r.summary(linear_model)

~/.local/lib/python3.6/site-packages/rpy2/robjects/__init__.py in __getattribute__(self, attr)
    326             return self.__getitem__(attr)
    327         except LookupError as le:
--> 328             raise AttributeError(orig_ae)
    329 
    330     def __getitem__(self, item):

AttributeError: 'R' object has no attribute 'data_frame'

datan

Rpy linear fit