Python源码示例:sklearn.CRF

示例1
def _train_model(self,
                     df_train: List[List[Tuple[Text, Text, Text, Text]]]
                     ) -> None:
        """Fit a new CRF entity tagger on ``df_train`` and keep it on ``self``.

        Each sentence of ``df_train`` is turned into a feature sequence by
        ``self._sentence_to_features`` and into a label sequence by
        ``self._sentence_to_labels``. The tagger is stored in
        ``self.ent_tagger`` before it is fitted; nothing is returned.
        """
        import sklearn_crfsuite

        feature_sequences = []
        for sentence in df_train:
            feature_sequences.append(self._sentence_to_features(sentence))

        label_sequences = []
        for sentence in df_train:
            label_sequences.append(self._sentence_to_labels(sentence))

        # Regularisation strengths and the iteration cap are read from the
        # component configuration; unobserved-but-possible transitions are
        # always included.
        tagger = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=self.component_config["L1_c"],  # L1 penalty coefficient
            c2=self.component_config["L2_c"],  # L2 penalty coefficient
            max_iterations=self.component_config["max_iterations"],
            all_possible_transitions=True,
        )
        self.ent_tagger = tagger
        tagger.fit(feature_sequences, label_sequences)
示例2
def get_sequence_probability(self, tokens, labels):
        """Return the joint probability of ``tokens`` and their CRF ``labels``

        Args:
            tokens (list of :class:`.Token`): list of tokens
            labels (list of str): CRF labels with their tagging scheme prefix
                ("B-color", "I-color", "O", etc)

        Returns:
            When ``self.slot_name_mapping`` is empty or unset: ``1.0`` if every
            label is the outside label (or there are no labels), ``0.0``
            otherwise. In all other cases, the value computed by
            ``self._get_sequence_probability`` on the features of ``tokens``.

        Note:
            The absolute value is generally not meaningful by itself; it is
            useful for comparing one label sequence against another.
        """
        if self.slot_name_mapping:
            computed_features = self.compute_features(tokens)
            return self._get_sequence_probability(computed_features, labels)

        # No slot mapping: only an all-outside labelling is considered possible.
        for label in labels:
            if label != OUTSIDE:
                return 0.0
        return 1.0
示例3
def _ensure_safe(X, Y):
    """Ensures that Y has at least one not empty label, otherwise the CRF model
    does not contain any label and crashes at

    Args:
        X: features
        Y: labels

    Returns:
        (safe_X, safe_Y): a pair of safe features and labels
    """
    safe_X = list(X)
    safe_Y = list(Y)
    if not any(X) or not any(Y):
        safe_X.append([""])  # empty feature
        safe_Y.append([OUTSIDE])  # outside label
    return safe_X, safe_Y 
示例4
def trainEntities(self, model, trainingData):
        """Train a CRF entity tagger for ``model`` and dump it to disk.

        The dataset is built from ``trainingData['intents']`` with
        ``self.createDataset``, printed, converted to features and labels, and
        used to fit a CRF that is then written with ``joblib.dump`` to
        ``models/mitie_<model>.pkl``.

        Any exception raised along the way is caught and reported on stdout;
        the method returns None in every case.
        """
        try:
            dataset = self.createDataset(trainingData['intents'], nlp)
            print(dataset)

            feature_rows = []
            for sentence in dataset:
                feature_rows.append(self.sent2features(sentence))

            label_rows = []
            for sentence in dataset:
                label_rows.append(self.sent2labels(sentence))

            tagger = sklearn_crfsuite.CRF(
                algorithm='lbfgs',
                c1=0.1,
                c2=0.1,
                max_iterations=100,
                all_possible_transitions=True,
            )
            tagger.fit(feature_rows, label_rows)

            target_path = "models/mitie_" + str(model) + ".pkl"
            joblib.dump(tagger, target_path)

            print("Saving Model " + str(model) + " Entities")
        except Exception as ex:
            # Best-effort training: failures are reported, not propagated.
            print("Error Training Model " + str(model) + " Entities " + str(ex))
示例5
def __init__(
        self,
        component_config: Optional[Dict[Text, Any]] = None,
        entity_taggers: Optional[Dict[Text, "CRF"]] = None,
    ) -> None:
        """Set up the component with its configuration and optional taggers.

        Args:
            component_config: configuration forwarded to the parent class.
            entity_taggers: taggers to store on the instance as
                ``entity_taggers``; presumably keyed by entity attribute
                name -- confirm against the training code.
        """
        super().__init__(component_config)

        # Fixed sequence of entity attributes, kept in ``self.crf_order``.
        ordered_attributes = [
            ENTITY_ATTRIBUTE_TYPE,
            ENTITY_ATTRIBUTE_ROLE,
            ENTITY_ATTRIBUTE_GROUP,
        ]
        self.crf_order = ordered_attributes
        self.entity_taggers = entity_taggers

        self._validate_configuration()
示例6
def _train_model(self, df_train):
        # type: (List[List[Tuple[Text, Text, Text, Text]]]) -> None
        """Fit a new CRF tagger on the training sentences.

        Features and labels are derived per sentence with
        ``self._sentence_to_features`` and ``self._sentence_to_labels``. The
        tagger is assigned to ``self.ent_tagger`` and then fitted.
        """
        import sklearn_crfsuite

        sentence_features = [
            self._sentence_to_features(sentence) for sentence in df_train
        ]
        sentence_labels = [
            self._sentence_to_labels(sentence) for sentence in df_train
        ]

        crf_factory = sklearn_crfsuite.CRF
        options = {
            'algorithm': 'lbfgs',
            # coefficient for L1 penalty
            'c1': self.component_config["L1_c"],
            # coefficient for L2 penalty
            'c2': self.component_config["L2_c"],
            # upper bound on training iterations (stop earlier)
            'max_iterations': self.component_config["max_iterations"],
            # include transitions that are possible, but not observed
            'all_possible_transitions': True,
        }
        self.ent_tagger = crf_factory(**options)
        self.ent_tagger.fit(sentence_features, sentence_labels)
示例7
def extract_features(self, examples, config, resources, y=None, fit=True):
        """Transforms a list of examples into features in CRF suite format.

        Args:
            examples (list of mindmeld.core.Query): a list of queries
            config (ModelConfig): The ModelConfig which may contain information used for feature
                                  extraction
            resources (dict): Resources which may be used for this model's feature extraction
            y (optional): Passed through unchanged as the second element of the
                          returned tuple.
            fit (bool, optional): Forwarded to ``_preprocess_data``.

        Returns:
            tuple: ``(X, y, None)`` where ``X`` is the result of
                ``_preprocess_data`` applied to the per-example features, ``y``
                is the argument as given, and the third element is always None.
        """
        # One feature structure per example, in input order.
        feats = [
            self.extract_example_features(example, config, resources)
            for example in examples
        ]
        X = self._preprocess_data(feats, fit)
        return X, y, None
示例8
def _preprocess_data(self, X, fit=False):
        """Converts data into formats of CRF suite.

        Args:
            X (list of dict): features of an example
            fit (bool, optional): True if processing data at fit time, false for predict time.

        Returns:
            (list of list of str): features in CRF suite format
        """
        if fit:
            self._feat_binner.fit(X)

        new_X = []
        for feat_seq in self._feat_binner.transform(X):
            feat_list = []
            for feature in feat_seq:
                temp_list = []
                for feat_type in sorted(feature.keys()):
                    temp_list.append("{}={}".format(feat_type, str(feature[feat_type])))
                feat_list.append(temp_list)
            new_X.append(feat_list)
        return new_X 
示例9
def __init__(self, config=None, **shared):
        """Create a CRF slot filler, optionally configured with a
        :class:`.CRFSlotFillerConfig`"""
        # A deep copy is handed to the parent class because the config is
        # mutated when the feature factories are fitted.
        super(CRFSlotFiller, self).__init__(deepcopy(config), **shared)

        self.crf_model = None

        # One feature factory per entry of the (copied) configuration.
        factories = []
        for factory_config in self.config.feature_factory_configs:
            factories.append(
                CRFFeatureFactory.from_config(factory_config, **shared))
        self.features_factories = factories

        self._features = None
        self.language = self.intent = self.slot_name_mapping = None
示例10
def features(self):
        """List of :class:`.Feature` used by the CRF

        The list is built on first access from every factory in
        ``self.features_factories`` and cached in ``self._features``.

        Raises:
            KeyError: if two built features share the same name. The cache is
                left untouched in that case, so the error is raised again on
                the next access instead of a partial list being returned.
        """
        if self._features is None:
            # Build into a local list and publish it only once every feature
            # has been validated.
            built_features = []
            seen_names = set()
            for factory in self.features_factories:
                for feature in factory.build_features():
                    if feature.name in seen_names:
                        raise KeyError("Duplicated feature: %s" % feature.name)
                    seen_names.add(feature.name)
                    built_features.append(feature)
            self._features = built_features
        return self._features
示例11
def labels(self):
        """List of CRF labels

        These labels differ from the slot names as they contain an additional
        prefix which depends on the :class:`.TaggingScheme` that is used
        (BIO by default).

        Returns:
            list: the decoded labels known to the underlying tagger, or an
                empty list when ``self.crf_model`` is None or has no tagger.
        """
        # getattr with a default also covers ``crf_model`` being None, which
        # would otherwise raise AttributeError instead of yielding no labels.
        tagger = getattr(self.crf_model, "tagger_", None)
        if tagger is None:
            return []
        return [_decode_tag(label) for label in tagger.labels()]
示例12
def _get_crf_model(crf_args):
    """Instantiate a ``sklearn_crfsuite.CRF`` from a dict of keyword arguments.

    If ``crf_args`` contains a non-None ``"model_filename"``, the parent
    directory of that file is created when it does not exist yet.

    Args:
        crf_args (dict): keyword arguments for the ``CRF`` constructor. The
            dict is not modified.

    Returns:
        CRF: the newly created, unfitted model.
    """
    from sklearn_crfsuite import CRF

    # Work on a copy and pop the filename out of it. Passing it both
    # explicitly and through ``**crf_args`` raises
    # "TypeError: got multiple values for keyword argument" whenever the key
    # is present.
    remaining_args = dict(crf_args)
    model_filename = remaining_args.pop("model_filename", None)
    if model_filename is not None:
        directory = Path(model_filename).parent
        if not directory.is_dir():
            mkdir_p(directory)

    return CRF(model_filename=model_filename, **remaining_args)
示例13
def test_ensure_safe(self):
        """Smoke test: data made safe by ``_ensure_safe`` can be fitted and
        used for prediction."""
        unsafe_examples = [
            ([[]], [[]]),
            ([[], []], [[], []]),
        ]

        # Nothing is asserted: per the original note, fitting on the unsafe
        # data segfaults, so finishing the loop is the success criterion.
        for features, labels in unsafe_examples:
            safe_features, safe_labels = _ensure_safe(features, labels)
            CRF().fit(safe_features, safe_labels).predict_single([""])
示例14
def get_crf():
    """
    Build a CRF learner with the specification used by medaCy.

    :return: a new, unfitted ``sklearn_crfsuite.CRF`` using the 'l2sgd'
        algorithm, ``c2=0.1``, at most 100 iterations and all possible
        transitions enabled
    """
    settings = dict(
        algorithm='l2sgd',
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    return sklearn_crfsuite.CRF(**settings)
示例15
def __init__(self, language_model=None):
        """Create the entity tagger with fixed CRF hyper-parameters.

        Args:
            language_model: stored as-is in ``self._language_model``.
        """
        self.ent_tagger = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=1,  # coefficient for L1 penalty
            c2=1e-3,  # coefficient for L2 penalty
            max_iterations=50,
            # include transitions that are possible, but not observed
            all_possible_transitions=True,
        )
        self._language_model = language_model
示例16
def _add_tag_to_crf_token(
        self,
        crf_tokens: List[CRFToken],
        predictions: Dict[Text, List[Dict[Text, float]]],
    ):
        """Write the predicted entity-type tags onto the CRF tokens.

        Does nothing when ``predictions`` has no entry for
        ``ENTITY_ATTRIBUTE_TYPE``. Otherwise the tags returned by
        ``self._most_likely_tag`` are paired with ``crf_tokens`` in order and
        stored in each token's ``entity_tag`` attribute.
        """
        if ENTITY_ATTRIBUTE_TYPE not in predictions:
            return

        type_predictions = predictions[ENTITY_ATTRIBUTE_TYPE]
        # Only the first element of the returned pair (the tags) is used.
        tags, _unused = self._most_likely_tag(type_predictions)
        for tag, token in zip(tags, crf_tokens):
            token.entity_tag = tag
示例17
def _train_model(self, df_train: List[List[CRFToken]]) -> None:
        """Train one CRF tagger per entry of ``self.crf_order``.

        ``self.entity_taggers`` is reset to an empty dict and then filled with
        a fitted tagger for every tag name, each trained on features and tags
        derived from ``df_train``.
        """
        import sklearn_crfsuite

        self.entity_taggers = {}

        for tag_name in self.crf_order:
            logger.debug(f"Training CRF for '{tag_name}'.")

            # Entity tag features are added for every CRF except the
            # entity-type one (the second level CRFs).
            with_tag_features = tag_name != ENTITY_ATTRIBUTE_TYPE

            sentence_features = []
            for sentence in df_train:
                sentence_features.append(
                    self._crf_tokens_to_features(sentence, with_tag_features)
                )

            sentence_tags = []
            for sentence in df_train:
                sentence_tags.append(
                    self._crf_tokens_to_tags(sentence, tag_name)
                )

            tagger = sklearn_crfsuite.CRF(
                algorithm="lbfgs",
                c1=self.component_config["L1_c"],  # L1 penalty coefficient
                c2=self.component_config["L2_c"],  # L2 penalty coefficient
                # upper bound on training iterations (stop earlier)
                max_iterations=self.component_config["max_iterations"],
                # include transitions that are possible, but not observed
                all_possible_transitions=True,
            )
            tagger.fit(sentence_features, sentence_tags)
            self.entity_taggers[tag_name] = tagger

            logger.debug("Training finished.")
示例18
def __init__(self, component_config=None, ent_tagger=None):
        # type: (Dict[Text, Any], sklearn_crfsuite.CRF) -> None
        """Initialise the extractor with its configuration and optional tagger.

        Args:
            component_config: configuration forwarded to the parent class.
            ent_tagger: CRF tagger stored as ``self.ent_tagger``; may be None.
        """
        # The type comment above follows the parameter order
        # (component_config, ent_tagger); it previously listed them swapped.
        super(CRFEntityExtractor, self).__init__(component_config)

        self.ent_tagger = ent_tagger

        self._validate_configuration()
示例19
def set_params(self, **parameters):
        """Replace the wrapped CRF with a new one configured by ``parameters``.

        A fresh ``CRF`` instance is stored in ``self._clf`` and then receives
        ``parameters`` through its own ``set_params``.

        Returns:
            self, so that calls can be chained.
        """
        fresh_model = CRF()
        self._clf = fresh_model
        fresh_model.set_params(**parameters)
        return self