M-Estimators for Robust Linear Modeling

In [1]:
%matplotlib inline

from __future__ import print_function
from statsmodels.compat import lmap
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

import statsmodels.api as sm
/build/statsmodels-Lrgq1G/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
  • An M-estimator minimizes the function

$$Q(e_i, \rho) = \sum_i~\rho \left (\frac{e_i}{s}\right )$$

where $\rho$ is a symmetric function of the residuals

  • The effect of $\rho$ is to reduce the influence of outliers
  • $s$ is an estimate of scale.
  • The robust estimates $\hat{\beta}$ are computed by the iteratively re-weighted least squares algorithm
  • We have several choices available for the weighting functions to be used
In [2]:
norms = sm.robust.norms
In [3]:
def plot_weights(support, weights_func, xlabels, xticks):
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    ax.plot(support, weights_func(support))
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels, fontsize=16)
    ax.set_ylim(-.1, 1.1)
    return ax

Andrew's Wave

In [4]:
help(norms.AndrewWave.weights)
Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Andrew's wave weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = sin(z/a)/(z/a)     for \|z\| <= a*pi
    
        weights(z) = 0                  for \|z\| > a*pi

In [5]:
a = 1.339
support = np.linspace(-np.pi*a, np.pi*a, 100)
andrew = norms.AndrewWave(a=a)
plot_weights(support, andrew.weights, ['$-\pi*a$', '0', '$\pi*a$'], [-np.pi*a, 0, np.pi*a]);

Hampel's 17A

In [6]:
help(norms.Hampel.weights)
Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Hampel weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = 1                            for \|z\| <= a
    
        weights(z) = a/\|z\|                        for a < \|z\| <= b
    
        weights(z) = a*(c - \|z\|)/(\|z\|*(c-b))      for b < \|z\| <= c
    
        weights(z) = 0                            for \|z\| > c

In [7]:
c = 8
support = np.linspace(-3*c, 3*c, 1000)
hampel = norms.Hampel(a=2., b=4., c=c)
plot_weights(support, hampel.weights, ['3*c', '0', '3*c'], [-3*c, 0, 3*c]);

Huber's t

In [8]:
help(norms.HuberT.weights)
Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Huber's t weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = 1          for \|z\| <= t
    
        weights(z) = t/\|z\|      for \|z\| > t

In [9]:
t = 1.345
support = np.linspace(-3*t, 3*t, 1000)
huber = norms.HuberT(t=t)
plot_weights(support, huber.weights, ['-3*t', '0', '3*t'], [-3*t, 0, 3*t]);

Least Squares

In [10]:
help(norms.LeastSquares.weights)
Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    The least squares estimator weighting function for the IRLS algorithm.
    
    The psi function scaled by the input z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = np.ones(z.shape)

In [11]:
support = np.linspace(-3, 3, 1000)
lst_sq = norms.LeastSquares()
plot_weights(support, lst_sq.weights, ['-3', '0', '3'], [-3, 0, 3]);

Ramsay's Ea

In [12]:
help(norms.RamsayE.weights)
Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Ramsay's Ea weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = exp(-a*\|z\|)

In [13]:
a = .3
support = np.linspace(-3*a, 3*a, 1000)
ramsay = norms.RamsayE(a=a)
plot_weights(support, ramsay.weights, ['-3*a', '0', '3*a'], [-3*a, 0, 3*a]);

Trimmed Mean

In [14]:
help(norms.TrimmedMean.weights)
Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Least trimmed mean weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        weights(z) = 1             for \|z\| <= c
    
        weights(z) = 0             for \|z\| > c

In [15]:
c = 2
support = np.linspace(-3*c, 3*c, 1000)
trimmed = norms.TrimmedMean(c=c)
plot_weights(support, trimmed.weights, ['-3*c', '0', '3*c'], [-3*c, 0, 3*c]);

Tukey's Biweight

In [16]:
help(norms.TukeyBiweight.weights)
Help on function weights in module statsmodels.robust.norms:

weights(self, z)
    Tukey's biweight weighting function for the IRLS algorithm
    
    The psi function scaled by z
    
    Parameters
    ----------
    z : array-like
        1d array
    
    Returns
    -------
    weights : array
        psi(z) = (1 - (z/c)**2)**2          for \|z\| <= R
    
        psi(z) = 0                          for \|z\| > R

In [17]:
c = 4.685
support = np.linspace(-3*c, 3*c, 1000)
tukey = norms.TukeyBiweight(c=c)
plot_weights(support, tukey.weights, ['-3*c', '0', '3*c'], [-3*c, 0, 3*c]);

Scale Estimators

  • Robust estimates of the location
In [18]:
x = np.array([1, 2, 3, 4, 500])
  • The mean is not a robust estimator of location
In [19]:
x.mean()
Out[19]:
102.0
  • The median, on the other hand, is a robust estimator with a breakdown point of 50%
In [20]:
np.median(x)
Out[20]:
3.0
  • Analagously for the scale
  • The standard deviation is not robust
In [21]:
x.std()
Out[21]:
199.00251254695254

Median Absolute Deviation

$$ median_i |X_i - median_j(X_j)|) $$

Standardized Median Absolute Deviation is a consistent estimator for $\hat{\sigma}$

$$\hat{\sigma}=K \cdot MAD$$

where $K$ depends on the distribution. For the normal distribution for example,

$$K = \Phi^{-1}(.75)$$

In [22]:
stats.norm.ppf(.75)
Out[22]:
0.6744897501960817
In [23]:
print(x)
[  1   2   3   4 500]
In [24]:
sm.robust.scale.stand_mad(x)
/build/statsmodels-Lrgq1G/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/robust/scale.py:49: FutureWarning: stand_mad is deprecated and will be removed in 0.7.0. Use mad instead.
  "instead.", FutureWarning)
Out[24]:
1.482602218505602
In [25]:
np.array([1,2,3,4,5.]).std()
Out[25]:
1.4142135623730951
  • The default for Robust Linear Models is MAD
  • another popular choice is Huber's proposal 2
In [26]:
np.random.seed(12345)
fat_tails = stats.t(6).rvs(40)
In [27]:
kde = sm.nonparametric.KDEUnivariate(fat_tails)
kde.fit()
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax.plot(kde.support, kde.density);
In [28]:
print(fat_tails.mean(), fat_tails.std())
0.0688231044810875 1.3471633229698652
In [29]:
print(stats.norm.fit(fat_tails))
(0.0688231044810875, 1.3471633229698652)
In [30]:
print(stats.t.fit(fat_tails, f0=6))
(6, 0.03900918717027818, 1.0564230978488927)
In [31]:
huber = sm.robust.scale.Huber()
loc, scale = huber(fat_tails)
print(loc, scale)
0.04048984333271795 1.1557140047569665
In [32]:
sm.robust.stand_mad(fat_tails)
/build/statsmodels-Lrgq1G/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/robust/scale.py:49: FutureWarning: stand_mad is deprecated and will be removed in 0.7.0. Use mad instead.
  "instead.", FutureWarning)
Out[32]:
1.115335001165415
In [33]:
sm.robust.stand_mad(fat_tails, c=stats.t(6).ppf(.75))
/build/statsmodels-Lrgq1G/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/robust/scale.py:49: FutureWarning: stand_mad is deprecated and will be removed in 0.7.0. Use mad instead.
  "instead.", FutureWarning)
Out[33]:
1.0483916565928972
In [34]:
sm.robust.scale.mad(fat_tails)
Out[34]:
1.115335001165415

Duncan's Occupational Prestige data - M-estimation for outliers

In [35]:
from statsmodels.graphics.api import abline_plot
from statsmodels.formula.api import ols, rlm
In [36]:
prestige = sm.datasets.get_rdataset("Duncan", "car", cache=True).data
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
/build/statsmodels-Lrgq1G/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/datasets/utils.py in _get_data(base_url, dataname, cache, extension)
    220     try:
--> 221         data, from_cache = _urlopen_cached(url, cache)
    222     except HTTPError as err:

/build/statsmodels-Lrgq1G/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/datasets/utils.py in _urlopen_cached(url, cache)
    211     if not from_cache:
--> 212         data = urlopen(url).read()
    213         if cache is not None:  # then put it in the cache

/usr/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 

/usr/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    531             meth = getattr(processor, meth_name)
--> 532             response = meth(req, response)
    533 

/usr/lib/python3.6/urllib/request.py in http_response(self, request, response)
    641             response = self.parent.error(
--> 642                 'http', request, response, code, msg, hdrs)
    643 

/usr/lib/python3.6/urllib/request.py in error(self, proto, *args)
    563         args = (dict, proto, meth_name) + args
--> 564         result = self._call_chain(*args)
    565         if result:

/usr/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:

/usr/lib/python3.6/urllib/request.py in http_error_302(self, req, fp, code, msg, headers)
    755 
--> 756         return self.parent.open(new, timeout=req.timeout)
    757 

/usr/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    531             meth = getattr(processor, meth_name)
--> 532             response = meth(req, response)
    533 

/usr/lib/python3.6/urllib/request.py in http_response(self, request, response)
    641             response = self.parent.error(
--> 642                 'http', request, response, code, msg, hdrs)
    643 

/usr/lib/python3.6/urllib/request.py in error(self, proto, *args)
    569             args = (dict, 'default', 'http_error_default') + orig_args
--> 570             return self._call_chain(*args)
    571 

/usr/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:

/usr/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    649     def http_error_default(self, req, fp, code, msg, hdrs):
--> 650         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    651 

HTTPError: HTTP Error 404: Not Found

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-36-b71db8dae743> in <module>()
----> 1 prestige = sm.datasets.get_rdataset("Duncan", "car", cache=True).data

/build/statsmodels-Lrgq1G/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/datasets/utils.py in get_rdataset(dataname, package, cache)
    288                      "master/doc/"+package+"/rst/")
    289     cache = _get_cache(cache)
--> 290     data, from_cache = _get_data(data_base_url, dataname, cache)
    291     data = read_csv(data, index_col=0)
    292     data = _maybe_reset_index(data)

/build/statsmodels-Lrgq1G/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/datasets/utils.py in _get_data(base_url, dataname, cache, extension)
    222     except HTTPError as err:
    223         if '404' in str(err):
--> 224             raise ValueError("Dataset %s was not found." % dataname)
    225         else:
    226             raise err

ValueError: Dataset Duncan was not found.
In [37]:
print(prestige.head(10))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-37-edc25e91563c> in <module>()
----> 1 print(prestige.head(10))

NameError: name 'prestige' is not defined
In [38]:
fig = plt.figure(figsize=(12,12))
ax1 = fig.add_subplot(211, xlabel='Income', ylabel='Prestige')
ax1.scatter(prestige.income, prestige.prestige)
xy_outlier = prestige.ix['minister'][['income','prestige']]
ax1.annotate('Minister', xy_outlier, xy_outlier+1, fontsize=16)
ax2 = fig.add_subplot(212, xlabel='Education',
                           ylabel='Prestige')
ax2.scatter(prestige.education, prestige.prestige);
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-38-274f453c76c3> in <module>()
      1 fig = plt.figure(figsize=(12,12))
      2 ax1 = fig.add_subplot(211, xlabel='Income', ylabel='Prestige')
----> 3 ax1.scatter(prestige.income, prestige.prestige)
      4 xy_outlier = prestige.ix['minister'][['income','prestige']]
      5 ax1.annotate('Minister', xy_outlier, xy_outlier+1, fontsize=16)

NameError: name 'prestige' is not defined
In [39]:
ols_model = ols('prestige ~ income + education', prestige).fit()
print(ols_model.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-39-bbe35b8ed986> in <module>()
----> 1 ols_model = ols('prestige ~ income + education', prestige).fit()
      2 print(ols_model.summary())

NameError: name 'prestige' is not defined
In [40]:
infl = ols_model.get_influence()
student = infl.summary_frame()['student_resid']
print(student)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-40-c124cc8482bc> in <module>()
----> 1 infl = ols_model.get_influence()
      2 student = infl.summary_frame()['student_resid']
      3 print(student)

NameError: name 'ols_model' is not defined
In [41]:
print(student.ix[np.abs(student) > 2])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-41-1e04d194bd48> in <module>()
----> 1 print(student.ix[np.abs(student) > 2])

NameError: name 'student' is not defined
In [42]:
print(infl.summary_frame().ix['minister'])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-42-171ee44fca4b> in <module>()
----> 1 print(infl.summary_frame().ix['minister'])

NameError: name 'infl' is not defined
In [43]:
sidak = ols_model.outlier_test('sidak')
sidak.sort('unadj_p', inplace=True)
print(sidak)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-43-2e23bf24d784> in <module>()
----> 1 sidak = ols_model.outlier_test('sidak')
      2 sidak.sort('unadj_p', inplace=True)
      3 print(sidak)

NameError: name 'ols_model' is not defined
In [44]:
fdr = ols_model.outlier_test('fdr_bh')
fdr.sort('unadj_p', inplace=True)
print(fdr)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-44-ef10f7ed26ee> in <module>()
----> 1 fdr = ols_model.outlier_test('fdr_bh')
      2 fdr.sort('unadj_p', inplace=True)
      3 print(fdr)

NameError: name 'ols_model' is not defined
In [45]:
rlm_model = rlm('prestige ~ income + education', prestige).fit()
print(rlm_model.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-45-f1afe0449c38> in <module>()
----> 1 rlm_model = rlm('prestige ~ income + education', prestige).fit()
      2 print(rlm_model.summary())

NameError: name 'prestige' is not defined
In [46]:
print(rlm_model.weights)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-46-758d2e6db2b4> in <module>()
----> 1 print(rlm_model.weights)

NameError: name 'rlm_model' is not defined

Hertzprung Russell data for Star Cluster CYG 0B1 - Leverage Points

  • Data is on the luminosity and temperature of 47 stars in the direction of Cygnus.
In [47]:
dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data
In [48]:
from matplotlib.patches import Ellipse
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='log(Temp)', ylabel='log(Light)', title='Hertzsprung-Russell Diagram of Star Cluster CYG OB1')
ax.scatter(*dta.values.T)
# highlight outliers
e = Ellipse((3.5, 6), .2, 1, alpha=.25, color='r')
ax.add_patch(e);
ax.annotate('Red giants', xy=(3.6, 6), xytext=(3.8, 6),
            arrowprops=dict(facecolor='black', shrink=0.05, width=2),
            horizontalalignment='left', verticalalignment='bottom',
            clip_on=True, # clip to the axes bounding box
            fontsize=16,
     )
# annotate these with their index
for i,row in dta.ix[dta['log.Te'] < 3.8].iterrows():
    ax.annotate(i, row, row + .01, fontsize=14)
xlim, ylim = ax.get_xlim(), ax.get_ylim()
/usr/lib/python3/dist-packages/ipykernel_launcher.py:15: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
In [49]:
from IPython.display import Image
Image(filename='star_diagram.png')
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-49-37020820c14e> in <module>()
      1 from IPython.display import Image
----> 2 Image(filename='star_diagram.png')

/usr/lib/python3/dist-packages/IPython/core/display.py in __init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata)
   1019         self.unconfined = unconfined
   1020         self.metadata = metadata
-> 1021         super(Image, self).__init__(data=data, url=url, filename=filename)
   1022 
   1023         if retina:

/usr/lib/python3/dist-packages/IPython/core/display.py in __init__(self, data, url, filename)
    611         self.filename = None if filename is None else unicode_type(filename)
    612 
--> 613         self.reload()
    614         self._check_data()
    615 

/usr/lib/python3/dist-packages/IPython/core/display.py in reload(self)
   1041         """Reload the raw data from file or URL."""
   1042         if self.embed:
-> 1043             super(Image,self).reload()
   1044             if self.retina:
   1045                 self._retina_shape()

/usr/lib/python3/dist-packages/IPython/core/display.py in reload(self)
    629         """Reload the raw data from file or URL."""
    630         if self.filename is not None:
--> 631             with open(self.filename, self._read_flags) as f:
    632                 self.data = f.read()
    633         elif self.url is not None:

FileNotFoundError: [Errno 2] No such file or directory: 'star_diagram.png'
In [50]:
y = dta['log.light']
X = sm.add_constant(dta['log.Te'], prepend=True)
ols_model = sm.OLS(y, X).fit()
abline_plot(model_results=ols_model, ax=ax)
Out[50]:
In [51]:
rlm_mod = sm.RLM(y, X, sm.robust.norms.TrimmedMean(.5)).fit()
abline_plot(model_results=rlm_mod, ax=ax, color='red')
Out[51]:
  • Why? Because M-estimators are not robust to leverage points.
In [52]:
infl = ols_model.get_influence()
In [53]:
h_bar = 2*(ols_model.df_model + 1 )/ols_model.nobs
hat_diag = infl.summary_frame()['hat_diag']
hat_diag.ix[hat_diag > h_bar]
/usr/lib/python3/dist-packages/ipykernel_launcher.py:3: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
Out[53]:
10    0.194103
19    0.194103
29    0.198344
33    0.194103
Name: hat_diag, dtype: float64
In [54]:
sidak2 = ols_model.outlier_test('sidak')
sidak2.sort('unadj_p', inplace=True)
print(sidak2)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-54-a7d965d3ecb2> in <module>()
      1 sidak2 = ols_model.outlier_test('sidak')
----> 2 sidak2.sort('unadj_p', inplace=True)
      3 print(sidak2)

/usr/lib/python3/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   3612             if name in self._info_axis:
   3613                 return self[name]
-> 3614             return object.__getattribute__(self, name)
   3615 
   3616     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'sort'
In [55]:
fdr2 = ols_model.outlier_test('fdr_bh')
fdr2.sort('unadj_p', inplace=True)
print(fdr2)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-55-33b146e0bbad> in <module>()
      1 fdr2 = ols_model.outlier_test('fdr_bh')
----> 2 fdr2.sort('unadj_p', inplace=True)
      3 print(fdr2)

/usr/lib/python3/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   3612             if name in self._info_axis:
   3613                 return self[name]
-> 3614             return object.__getattribute__(self, name)
   3615 
   3616     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'sort'
  • Let's delete that line
In [56]:
del ax.lines[-1]
In [57]:
weights = np.ones(len(X))
weights[X[X['log.Te'] < 3.8].index.values - 1] = 0
wls_model = sm.WLS(y, X, weights=weights).fit()
abline_plot(model_results=wls_model, ax=ax, color='green')
Out[57]:
  • MM estimators are good for this type of problem, unfortunately, we don't yet have these yet.
  • It's being worked on, but it gives a good excuse to look at the R cell magics in the notebook.
In [58]:
yy = y.values[:,None]
xx = X['log.Te'].values[:,None]
In [59]:
%load_ext rpy2.ipython

%R library(robustbase)
%Rpush yy xx
%R mod <- lmrob(yy ~ xx);
%R params <- mod$coefficients;
%Rpull params
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-59-9555d23845d1> in <module>()
----> 1 get_ipython().magic('load_ext rpy2.ipython')
      2 
      3 get_ipython().magic('R library(robustbase)')
      4 get_ipython().magic('Rpush yy xx')
      5 get_ipython().magic('R mod <- lmrob(yy ~ xx);')

/usr/lib/python3/dist-packages/IPython/core/interactiveshell.py in magic(self, arg_s)
   2158         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2159         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2160         return self.run_line_magic(magic_name, magic_arg_s)
   2161 
   2162     #-------------------------------------------------------------------------

/usr/lib/python3/dist-packages/IPython/core/interactiveshell.py in run_line_magic(self, magic_name, line)
   2079                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2080             with self.builtin_trap:
-> 2081                 result = fn(*args,**kwargs)
   2082             return result
   2083 

<decorator-gen-63> in load_ext(self, module_str)

/usr/lib/python3/dist-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

/usr/lib/python3/dist-packages/IPython/core/magics/extension.py in load_ext(self, module_str)
     35         if not module_str:
     36             raise UsageError('Missing module name.')
---> 37         res = self.shell.extension_manager.load_extension(module_str)
     38 
     39         if res == 'already loaded':

/usr/lib/python3/dist-packages/IPython/core/extensions.py in load_extension(self, module_str)
     81             if module_str not in sys.modules:
     82                 with prepended_to_syspath(self.ipython_extension_dir):
---> 83                     __import__(module_str)
     84             mod = sys.modules[module_str]
     85             if self._call_load_ipython_extension(mod):

ModuleNotFoundError: No module named 'rpy2'
In [60]:
%R print(mod)
UsageError: Line magic function `%R` not found.
In [61]:
print(params)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-61-1b9f0c7a8c29> in <module>()
----> 1 print(params)

NameError: name 'params' is not defined
In [62]:
abline_plot(intercept=params[0], slope=params[1], ax=ax, color='green')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-62-c69ec2593f52> in <module>()
----> 1 abline_plot(intercept=params[0], slope=params[1], ax=ax, color='green')

NameError: name 'params' is not defined

Exercise: Breakdown points of M-estimator

In [63]:
np.random.seed(12345)
nobs = 200
beta_true = np.array([3, 1, 2.5, 3, -4])
X = np.random.uniform(-20,20, size=(nobs, len(beta_true)-1))
# stack a constant in front
X = sm.add_constant(X, prepend=True) # np.c_[np.ones(nobs), X]
mc_iter = 500
contaminate = .25 # percentage of response variables to contaminate
In [64]:
all_betas = []
for i in range(mc_iter):
    y = np.dot(X, beta_true) + np.random.normal(size=200)
    random_idx = np.random.randint(0, nobs, size=int(contaminate * nobs))
    y[random_idx] = np.random.uniform(-750, 750)
    beta_hat = sm.RLM(y, X).fit().params
    all_betas.append(beta_hat)
In [65]:
all_betas = np.asarray(all_betas)
se_loss = lambda x : np.linalg.norm(x, ord=2)**2
se_beta = lmap(se_loss, all_betas - beta_true)

Squared error loss

In [66]:
np.array(se_beta).mean()
Out[66]:
0.4450294873068618
In [67]:
all_betas.mean(0)
Out[67]:
array([ 2.99711706,  0.99898147,  2.49909344,  2.99712918, -3.99626521])
In [68]:
beta_true
Out[68]:
array([ 3. ,  1. ,  2.5,  3. , -4. ])
In [69]:
se_loss(all_betas.mean(0) - beta_true)
Out[69]:
3.236091328675419e-05