try:
    from collections.abc import Mapping as _Mapping
except ImportError:  # Python 2: the ABCs are only available from ``collections``
    from collections import Mapping as _Mapping
from collections import defaultdict as _defaultdict
import itertools as _itertools

from sklearn.model_selection import GridSearchCV as _GridSearchCV
from sklearn.pipeline import Pipeline as _Pipeline
from sklearn.pipeline import FeatureUnion as _FeatureUnion
def set_grid(estimator, **grid):
    """Annotate an estimator with the parameter grid to search over.

    Any grid previously attached to ``estimator`` is replaced.

    Parameters
    ----------
    estimator : object
        The object to annotate. The grid is stored on it as the
        ``_param_grid`` attribute.
    grid : dict (str -> list of values)
        Keyword arguments giving, for each named parameter, the values to
        be searched.

    Returns
    -------
    estimator
        The same object that was passed in, which is useful for chaining.
    """
    setattr(estimator, '_param_grid', grid)
    return estimator
def _update_grid(dest, src, prefix=None):
    """Combine every grid in ``dest`` with every grid in ``src``.

    Parameters
    ----------
    dest : list of dict
        Existing grids. They are not modified; each one is copied before
        being updated.
    src : list of dict or None
        Grids to merge in. ``None`` means there is nothing to merge.
    prefix : str, optional
        When non-empty, it is prepended to every key of ``src`` before
        merging.

    Returns
    -------
    list of dict
        ``dest`` itself when ``src`` is None. Otherwise a new list holding
        one dict per (dest, src) pair, in which entries from ``src``
        override entries from ``dest`` that share a key.
    """
    if src is None:
        return dest

    if prefix:
        src = [{prefix + key: values for key, values in extra.items()}
               for extra in src]

    def _overlay(base, extra):
        # Copy first so the caller's dicts are left untouched.
        combined = base.copy()
        combined.update(extra)
        return combined

    return [_overlay(base, extra)
            for base, extra in _itertools.product(dest, src)]
def _build_param_grid(estimator):
    """Recursively collect the grids annotated on an estimator.

    The starting point is the ``_param_grid`` attribute of ``estimator``
    (an empty grid when absent); a single mapping is treated as a list of
    one grid. Two expansions are then applied:

    1. Each top-level parameter of ``estimator`` that itself has
       ``get_params`` contributes its own grids, with keys prefixed by
       ``<param>__``, unless a grid already searches that parameter
       directly.
    2. Each candidate value inside a grid that has ``get_params`` and a
       grid of its own is split into separate grids that fix that
       candidate and add its grid under the ``<param>__`` prefix.

    Parameters
    ----------
    estimator : object providing ``get_params()``

    Returns
    -------
    list of dict or None
        The expanded grids, or None when the result is exactly ``[{}]``,
        i.e. there is nothing to search.
    """
    grids = getattr(estimator, '_param_grid', {})
    if isinstance(grids, _Mapping):
        grids = [grids]

    # Pass 1: estimator-valued parameters bring along their own grids.
    for name, param in estimator.get_params().items():
        # Only direct parameters are considered; nested ones are reached
        # through the recursive call below.
        if '__' in name or not hasattr(param, 'get_params'):
            continue
        nested = _build_param_grid(param)
        expanded = []
        for grid in grids:
            if name in grid:
                # The parameter is searched explicitly in this grid, so the
                # grid of its current value does not apply.
                expanded.append(grid)
            else:
                expanded.extend(_update_grid([grid], nested, name + '__'))
        grids = expanded

    # Pass 2: candidate values inside a grid may carry their own grids.
    result = []
    for grid in grids:
        variants = [grid]
        for name, candidates in grid.items():
            alternatives = []
            plain = []
            for candidate in candidates:
                if hasattr(candidate, 'get_params'):
                    nested = _build_param_grid(candidate)
                else:
                    nested = None
                if nested is None:
                    plain.append(candidate)
                else:
                    alternatives.extend(
                        _update_grid([{name: [candidate]}], nested,
                                     name + '__'))
            if plain:
                # Candidates without a grid of their own share one entry.
                alternatives.append({name: plain})
            variants = _update_grid(variants, alternatives)
        result.extend(variants)

    return None if result == [{}] else result
def build_param_grid(estimator):
    """Determine the parameter grid annotated on the estimator.

    Parameters
    ----------
    estimator : scikit-learn compatible estimator
        Should have been annotated using :func:`set_grid`

    Returns
    -------
    dict or list of dict
        An empty dict when no grid was found, the grid itself when there is
        exactly one, and otherwise the list of grids.

    Notes
    -----
    Most often, it is unnecessary for this to be used directly, and
    :func:`make_grid_search` should be used instead.
    """
    grids = _build_param_grid(estimator)
    if grids is None:
        return {}
    return grids[0] if len(grids) == 1 else grids
def _check_estimator(estimator):
    """Validate the estimator, wrapping a list of alternatives in a Pipeline.

    Parameters
    ----------
    estimator : estimator or list of estimators
        A list is turned into a single-step Pipeline whose ``root`` step
        starts as the first element and whose grid searches over every
        element of the list.

    Returns
    -------
    estimator
        The wrapping Pipeline for a list, otherwise the argument unchanged.

    Raises
    ------
    ValueError
        If ``estimator`` is an empty list, or is not a list and has no
        ``fit`` attribute.
    """
    if isinstance(estimator, list):
        if not estimator:
            # Otherwise estimator[0] below fails with a bare IndexError.
            raise ValueError('Expected at least one estimator to search '
                             'over, but got an empty list')
        return set_grid(_Pipeline([('root', estimator[0])]),
                        root=estimator)
    if not hasattr(estimator, 'fit'):
        # Wrap in a 1-tuple: a tuple argument would otherwise be unpacked
        # by the % operator and raise TypeError instead of this ValueError.
        raise ValueError('Expected estimator, but %r does not have .fit'
                         % (estimator,))
    return estimator
def make_grid_search(estimator, **kwargs):
    """Construct a GridSearchCV with the given estimator and its set grid.

    Parameters
    ----------
    estimator : (list of) estimator
        When a list, the estimators are searched over.
    kwargs
        Other parameters to the
        :class:`sklearn.model_selection.GridSearchCV` constructor.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Built from the checked estimator and the grid that
        :func:`build_param_grid` reports for it.
    """
    checked = _check_estimator(estimator)
    param_grid = build_param_grid(checked)
    return _GridSearchCV(checked, param_grid, **kwargs)
def _name_steps(steps, default='alt'):
    """Generate names for steps and the grid of their alternatives.

    Parameters
    ----------
    steps : iterable
        Each item is either a single estimator or a list of alternatives.
    default : str
        Name used for a step whose alternatives are of more than one type.

    Returns
    -------
    named_steps : list of (str, object)
        Each step's name paired with its first alternative.
    grid : dict (str -> list)
        For every step with more than one alternative, its name mapped to
        the full list of alternatives (including any None).
    """
    alternatives = [step if isinstance(step, list) else [step]
                    for step in steps]

    names = []
    for options in alternatives:
        pool = list(options)
        if len(pool) > 1:
            # None does not influence the name of a step with alternatives.
            for _ in range(pool.count(None)):
                pool.remove(None)
        kinds = {type(option).__name__.lower() for option in pool}
        names.append(default if len(kinds) > 1 else kinds.pop())

    tally = _defaultdict(int)
    for name in names:
        tally[name] += 1
    # Only names used more than once get a numeric suffix.
    remaining = {name: count for name, count in tally.items() if count != 1}

    # Walk backwards so suffixes count up from 1 in step order.
    for idx, base in reversed(list(enumerate(names))):
        if base in remaining:
            names[idx] = base + "-%d" % remaining[base]
            remaining[base] -= 1

    named_steps = [(name, options[0])
                   for name, options in zip(names, alternatives)]
    grid = {name: options
            for name, options in zip(names, alternatives)
            if len(options) > 1}
    return named_steps, grid
def make_pipeline(*steps, **kwargs):
    """Construct a Pipeline with alternative estimators to search over.

    Parameters
    ----------
    steps
        Each step is specified as one of:

        * an estimator instance
        * None (meaning no transformation)
        * a list of the above, indicating that a grid search should
          alternate over the estimators (or None) in the list
    kwargs
        Keyword arguments to the constructor of
        :class:`sklearn.pipeline.Pipeline`.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Built from the first alternative of every step and annotated, via
        :func:`set_grid`, with the alternatives of each multi-option step.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> from sklearn.feature_extraction.text import TfidfTransformer
    >>> from sklearn.feature_selection import SelectKBest
    >>> from sklearn.decomposition import PCA
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import ParameterGrid
    >>> from searchgrid import make_pipeline, build_param_grid, set_grid
    >>> pipe = make_pipeline(CountVectorizer(),
    ...                      [TfidfTransformer(), None],
    ...                      [PCA(n_components=5), SelectKBest(k=5)],
    ...                      [set_grid(LogisticRegression(),
    ...                                C=[.1, 1., 10.]),
    ...                       RandomForestClassifier()])
    >>> pipe.steps  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    [('countvectorizer', CountVectorizer(...)),
     ('tfidftransformer', TfidfTransformer(...)),
     ('alt-1', PCA(...)),
     ('alt-2', LogisticRegression(...))]
    >>> n_combinations = len(ParameterGrid(build_param_grid(pipe)))
    >>> n_combinations
    ... # 2 * 2 * (3 + 1)
    16

    Notes
    -----
    Each step is named according to the set of estimator types in its list:

    * if a step has only one type of estimator (disregarding None), it
      takes that estimator's class name (lowercased)
    * if a step has estimators of mixed type, the step is named 'alt'
    * if there are multiple steps of the same name using the above rules,
      a suffix '-1', '-2', etc. is added.
    """
    named_steps, grid = _name_steps(steps)
    pipeline = _Pipeline(named_steps, **kwargs)
    return set_grid(pipeline, **grid)
def make_union(*transformers, **kwargs):
    """Construct a FeatureUnion with alternative estimators to search over.

    Parameters
    ----------
    transformers
        Each transformer is specified as one of:

        * an estimator instance
        * None (meaning no features)
        * a list of the above, indicating that a grid search should
          alternate over the estimators (or None) in the list
    kwargs
        Keyword arguments to the constructor of
        :class:`sklearn.pipeline.FeatureUnion`.

    Returns
    -------
    sklearn.pipeline.FeatureUnion
        Built from the first alternative of every entry and annotated, via
        :func:`set_grid`, with the alternatives of each multi-option entry.

    Notes
    -----
    Each step is named according to the set of estimator types in its list:

    * if a step has only one type of estimator (disregarding None), it
      takes that estimator's class name (lowercased)
    * if a step has estimators of mixed type, the step is named 'alt'
    * if there are multiple steps of the same name using the above rules,
      a suffix '-1', '-2', etc. is added.
    """
    named_transformers, grid = _name_steps(transformers)
    union = _FeatureUnion(named_transformers, **kwargs)
    return set_grid(union, **grid)