"""
Feature base class and implementations.
This module is part of the internal implementation and should not be imported directly.
Use the public API in the root package instead.
"""
from abc import abstractmethod, ABC
from functools import partial
import pandas as pd
import numpy as np
from ..utils import is_null, get_logger, type_check
from py_stringmatching import Levenshtein, NeedlemanWunsch, SmithWaterman
#from active_matcher.fv_generator import BuildCache
log = get_logger(__name__)
# Abstract base class shared by the feature implementations in this module.
class Feature(ABC):
    """
    Abstract base class for a feature computed between an attribute of
    table A (``a_attr``) and an attribute of table B (``b_attr``).

    Subclasses implement ``__str__``, ``__call__``, ``_preprocess`` and
    ``_preprocess_output_column``.

    Raises
    ------
    TypeError
        if ``a_attr`` or ``b_attr`` is not a string
    """

    def __init__(self, a_attr : str, b_attr : str):
        if not isinstance(a_attr, str) or not isinstance(b_attr, str):
            raise TypeError(f'a_attr and b_attr must be strings not {type(a_attr), type(b_attr)}')

        self._a_attr = a_attr
        self._b_attr = b_attr

    def build(self, A, B, cache):
        """
        Guaranteed to be called before the feature's preprocessing is done.
        This method should generate and store all of the metadata required to
        compute the feature over A and B. NOTE: B may be None.

        The base implementation does nothing.
        """
        pass

    @classmethod
    def template(cls, **kwargs):
        """
        return a ``functools.partial`` of this class with ``kwargs`` already
        bound; calling it with the remaining arguments constructs the feature
        """
        return partial(cls, **kwargs)

    @property
    def a_attr(self):
        """
        the name of the attribute from table A used to generate this feature
        """
        return self._a_attr

    @property
    def b_attr(self):
        """
        the name of the attribute from table B used to generate this feature
        """
        return self._b_attr

    @abstractmethod
    def __str__(self):
        """
        return a string representation of this feature, this should uniquely identify the feature
        """
        pass

    @abstractmethod
    def __call__(self, A : dict, B : pd.DataFrame) -> pd.Series:
        """
        compute the feature with A for each row in B, both A and B are preprocessed

        NOTE(review): the concrete features in this module read ``b_attr``
        from the first argument (a single record) and ``a_attr`` from the
        second (a collection of records) -- confirm which table each
        argument is expected to come from.
        """
        pass

    @abstractmethod
    def _preprocess(self, data : pd.DataFrame, input_col : str) -> pd.Series:
        """
        this method should perform preprocessing for ``input_col`` and
        return a series with the preprocessed data, named
        ``_preprocess_output_column(input_col)``
        """
        pass

    @abstractmethod
    def _preprocess_output_column(self, attr):
        """
        the name of the column that will be output when preprocessing
        attribute ``attr`` for this feature. Return None if there is no
        preprocessing that needs to be done for this feature. This column
        name + a row id must uniquely identify an object in the preprocessing
        output. For example, jaccard_3gram(a_name, b_name) would probably
        output 3gram_tokens(a_name) for preprocessing table A. Note that if
        there are name collisions, preprocessing will silently skip
        processing data and lead to strange behavior.
        """
        pass

    def preprocess_output_column(self, for_table_a : bool):
        """
        get the name of the preprocessing output column for table A or B
        """
        attr = self.a_attr if for_table_a else self.b_attr
        return self._preprocess_output_column(attr)

    def preprocess(self, data, is_table_a):
        """
        preprocess the data, adding the output column to data

        ``data`` is modified in place and also returned. Nothing is done when
        the feature needs no preprocessing (output column is None) or when
        the output column is already present in ``data``.
        """
        out_col = self.preprocess_output_column(is_table_a)
        if out_col is None or out_col in data:
            return data

        attr = self.a_attr if is_table_a else self.b_attr
        col = self._preprocess(data, attr)
        # stored under the series' own name, as the original implementation did
        data[col.name] = col
        return data
class ExactMatchFeature(Feature):
    """
    Case insensitive exact string match.

    Yields 1.0 where the lowercased ``a_attr`` value equals the lowercased
    ``b_attr`` value of the record and 0.0 otherwise (a null ``a_attr``
    value also yields 0.0). If the record's ``b_attr`` value is not a
    string, every row is NaN.
    """

    def _preprocess_output_column(self, for_table_a):
        # this feature works on the raw attribute values, so there is no
        # preprocessing output column
        return None

    def _preprocess(self, data, is_table_a):
        # nothing to preprocess; hand the data back untouched
        return data

    def __call__(self, rec, recs):
        """
        compare ``rec[b_attr]`` against every value of ``recs[a_attr]``

        returns a float64 series indexed like ``recs[a_attr]``
        """
        s = rec[self.b_attr]
        strings = recs[self.a_attr]
        if not isinstance(s, str):
            return pd.Series(np.nan, index=strings.index)

        target = s.lower()
        # str() keeps non-null, non-string values (e.g. numbers) from raising
        # AttributeError on .lower(), matching the other string features
        lowered = strings.apply(lambda x : str(x).lower() if pd.notnull(x) else None)
        return lowered.eq(target).astype(np.float64)

    def __str__(self):
        return f'exact_match({self.a_attr}, {self.b_attr})'
class RelDiffFeature(Feature):
    """
    relative difference between two values

    computed as ``|a - b| / max(|a|, |b|)`` on the float-converted
    attribute values
    """

    def __init__(self, a_attr, b_attr):
        super().__init__(a_attr, b_attr)
        # names of the preprocessed float columns read by __call__
        self._a_float_col = self._preprocess_output_column(a_attr)
        self._b_float_col = self._preprocess_output_column(b_attr)

    def _preprocess_output_column(self, attr):
        return f'float({attr})'

    def _preprocess(self, data, input_col):
        """
        convert ``data[input_col]`` to floats (None is passed through) and
        return the series named ``float(<input_col>)``
        """
        floats = data[input_col].apply(lambda x : float(x) if x is not None else None)
        floats.name = self._preprocess_output_column(input_col)
        return floats

    def __call__(self, rec, recs):
        """
        relative difference between the record's float value and every
        float value in ``recs``

        returns a float64 series indexed like the ``recs`` float column.
        Every row is NaN when the record's value is null; a row is NaN when
        its own value is NaN; a row is 0.0 when both values are zero.
        """
        f = rec[self._b_float_col]
        floats = recs[self._a_float_col]
        if is_null(f):
            return pd.Series(np.nan, index=floats.index)

        # compute in float64 throughout: going through float32 discards
        # precision that the float64 result cannot recover
        vals = floats.values.astype(np.float64)
        f = np.float64(f)

        with np.errstate(invalid='ignore', divide='ignore'):
            num = np.abs(vals - f)
            den = np.maximum(np.abs(vals), np.abs(f))
            # den == 0 only when both values are zero; they are identical, so
            # the relative difference is 0.0 rather than the NaN of 0/0.
            # NaN denominators satisfy den != 0 and still propagate NaN.
            out = np.zeros_like(den)
            np.divide(num, den, out=out, where=den != 0)

        return pd.Series(out, index=floats.index)

    def __str__(self):
        return f'rel_diff({self.a_attr}, {self.b_attr})'
class EditDistanceFeature(Feature):
    """
    edit distance between two strings, case insensitive

    the value is whatever py_stringmatching's ``Levenshtein.get_sim_score``
    returns for the two lowercased strings
    """
    _func = Levenshtein().get_sim_score

    def _preprocess_output_column(self, for_table_a):
        # this feature works on the raw attribute values, so there is no
        # preprocessing output column
        return None

    def _preprocess(self, data, is_table_a):
        # nothing to preprocess; hand the data back untouched
        return data

    def __call__(self, rec, recs):
        """
        score ``rec[b_attr]`` against every value of ``recs[a_attr]``

        returns a float64 series indexed like ``recs[a_attr]``. Every row is
        NaN when the record's value is not a string; a row is NaN when its
        own value is null. Non-string, non-null values are converted with
        ``str()``.
        """
        s = rec[self.b_attr]
        strings = recs[self.a_attr]
        if not isinstance(s, str):
            return pd.Series(np.nan, index=strings.index)

        # lowercase BOTH sides; previously only the recs side was lowercased,
        # so the comparison was not actually case insensitive
        s = s.lower()
        return strings.apply(
            lambda x : self._func(s, str(x).lower()) if pd.notnull(x) else np.nan
        ).astype(np.float64)

    def __str__(self):
        return f'edit_distance({self.a_attr}, {self.b_attr})'
class NeedlemanWunschFeature(Feature):
    """
    Needleman-Wunsch score between two strings, case insensitive

    the value is whatever py_stringmatching's
    ``NeedlemanWunsch.get_raw_score`` returns for the two lowercased strings
    """
    _func = NeedlemanWunsch().get_raw_score

    def _preprocess_output_column(self, for_table_a):
        # this feature works on the raw attribute values, so there is no
        # preprocessing output column
        return None

    def _preprocess(self, data, is_table_a):
        # nothing to preprocess; hand the data back untouched
        return data

    def _sim_func(self, x, y):
        """
        raw score for ``x`` and ``y``, or 0.0 when both are empty
        """
        # NOTE(review): the length is only used as an "both empty" guard; the
        # raw score is not divided by it -- confirm no normalization is intended
        div = max(len(x), len(y))
        if div == 0:
            return 0.0
        return self._func(x, y)

    def __call__(self, rec, recs):
        """
        score ``rec[b_attr]`` against every value of ``recs[a_attr]``

        returns a float64 series indexed like ``recs[a_attr]``. Every row is
        NaN when the record's value is not a string; a row is NaN when its
        own value is null. Non-string, non-null values are converted with
        ``str()``.
        """
        s = rec[self.b_attr]
        strings = recs[self.a_attr]
        if not isinstance(s, str):
            return pd.Series(np.nan, index=strings.index)

        # lowercase BOTH sides; previously only the recs side was lowercased,
        # so the comparison was not actually case insensitive
        s = s.lower()
        return strings.apply(
            lambda x : self._sim_func(s, str(x).lower()) if pd.notnull(x) else np.nan
        ).astype(np.float64)

    def __str__(self):
        return f'needleman_wunch({self.a_attr}, {self.b_attr})'
class SmithWatermanFeature(Feature):
    """
    Smith-Waterman score between two strings, case insensitive

    the value is whatever py_stringmatching's
    ``SmithWaterman.get_raw_score`` returns for the two lowercased strings
    """
    _func = SmithWaterman().get_raw_score

    def _preprocess_output_column(self, for_table_a):
        # this feature works on the raw attribute values, so there is no
        # preprocessing output column
        return None

    def _preprocess(self, data, is_table_a):
        # nothing to preprocess; hand the data back untouched
        return data

    def _sim_func(self, x, y):
        """
        raw score for ``x`` and ``y``, or 0.0 when both are empty
        """
        # NOTE(review): the length is only used as an "both empty" guard; the
        # raw score is not divided by it -- confirm no normalization is intended
        div = max(len(x), len(y))
        if div == 0:
            return 0.0
        return self._func(x, y)

    def __call__(self, rec, recs):
        """
        score ``rec[b_attr]`` against every value of ``recs[a_attr]``

        returns a float64 series indexed like ``recs[a_attr]``. Every row is
        NaN when the record's value is not a string; a row is NaN when its
        own value is null. Non-string, non-null values are converted with
        ``str()``.
        """
        s = rec[self.b_attr]
        strings = recs[self.a_attr]
        if not isinstance(s, str):
            return pd.Series(np.nan, index=strings.index)

        # lowercase BOTH sides; previously only the recs side was lowercased,
        # so the comparison was not actually case insensitive
        s = s.lower()
        return strings.apply(
            lambda x : self._sim_func(s, str(x).lower()) if pd.notnull(x) else np.nan
        ).astype(np.float64)

    def __str__(self):
        return f'smith_waterman({self.a_attr}, {self.b_attr})'