Source code for delta.features
"""
Feature selection utilities.
"""
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import StratifiedKFold
import numpy as np
import sklearn.svm
def get_rfe_features(corpus, estimator=None,
                     steps=((10000, 1000), (1000, 200), (500, 25)), cv=True):
    """
    Select terms from a corpus by staged recursive feature elimination.

    Every ``(features_to_select, step)`` pair in ``steps`` runs one
    ``RFE`` pass on the matrix left over by the previous pass, so the
    feature set shrinks stage by stage. If ``cv`` is true, a final
    ``RFECV`` pass (step 1, 3-fold ``StratifiedKFold``, accuracy scoring)
    is fitted on the remaining matrix and only the terms it supports are
    returned.

    Args:
        corpus: feature matrix handed to the selectors. It must also
            expose ``index`` (document identifiers), ``columns`` (the
            terms) and ``document_describer.group_name(doc)``, which
            yields the class label of a document.
        estimator: supervised learning estimator used by ``RFE`` and
            ``RFECV``. Defaults to ``sklearn.svm.SVC(kernel="linear")``.
        steps: iterable of ``(features_to_select, step)`` pairs, applied
            in order. May be empty.
        cv: whether to run the additional cross-validated selection.

    Returns:
        set: the selected terms. With ``cv=False`` these are the terms
        that survived the ``steps`` stages.
    """
    if estimator is None:
        estimator = sklearn.svm.SVC(kernel="linear")

    matrix = corpus
    # One class label per document, in the same order as the matrix rows.
    groups = np.array([corpus.document_describer.group_name(doc)
                       for doc in corpus.index])
    terms = np.array(corpus.columns)

    for n_features, step_size in steps:
        rfe = RFE(estimator=estimator, n_features_to_select=n_features,
                  step=step_size)
        matrix = rfe.fit_transform(matrix, groups)
        # Keep the term labels aligned with the columns that survived.
        terms = terms[rfe.support_]

    if not cv:
        # Without the cross-validated pass the staged result is final.
        # (Previously this path failed on an unassigned ``rfe_terms``.)
        return set(terms)

    rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(3),
                  scoring='accuracy')
    rfecv.fit(matrix, groups)
    return set(terms[rfecv.support_])