rasa.nlu.test
CVEvaluationResult Objects
Stores NLU cross-validation results.
log_evaluation_table
Log the sklearn evaluation metrics.
remove_empty_intent_examples
Remove those examples without an intent.
Arguments:
intent_results
- intent evaluation results
Returns:
intent evaluation results
remove_empty_response_examples
Remove those examples without a response.
Arguments:
response_results
- response selection evaluation results
Returns:
Response selection evaluation results
drop_intents_below_freq
Remove intent groups with fewer than cutoff instances.
Arguments:
training_data
- training data
cutoff
- threshold
Returns:
updated training data
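For example, a minimal usage sketch (the data path is hypothetical; load_data is Rasa's training-data loader):

```python
from rasa.shared.nlu.training_data.loading import load_data
from rasa.nlu.test import drop_intents_below_freq

training_data = load_data("data/nlu.yml")  # hypothetical path
# Keep only intents with at least 10 training examples.
filtered_data = drop_intents_below_freq(training_data, cutoff=10)
```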
write_intent_successes
Write successful intent predictions to a file.
Arguments:
intent_results
- intent evaluation results
successes_filename
- filename of file to save successful predictions to
write_response_successes
Write successful response selection predictions to a file.
Arguments:
response_results
- response selection evaluation results
successes_filename
- filename of file to save successful predictions to
plot_attribute_confidences
Create histogram of confidence distribution.
Arguments:
results
- evaluation results
hist_filename
- filename to save plot to
target_key
- key of target in results
prediction_key
- key of predictions in results
title
- title of plot
plot_entity_confidences
Creates histogram of confidence distribution.
Arguments:
merged_targets
- Entity labels.
merged_predictions
- Predicted entities.
merged_confidences
- Confidence scores of predictions.
hist_filename
- filename to save plot to
title
- title of plot
evaluate_response_selections
Creates summary statistics for response selection.
Only considers those examples with a set response. Others are filtered out. Returns a dictionary containing the evaluation result.
Arguments:
response_selection_results
- response selection evaluation results
output_directory
- directory to store files to
successes
- if True successful predictions are written to disk
errors
- if True errors are written to disk
disable_plotting
- if True no plots are created
report_as_dict
- True if the evaluation report should be returned as a dict. If False, the report is returned in a human-readable text format. If None, report_as_dict is considered True in case an output_directory is given.
Returns:
dictionary with evaluation results
evaluate_intents
Creates summary statistics for intents.
Only considers those examples with a set intent. Others are filtered out. Returns a dictionary containing the evaluation result.
Arguments:
intent_results
- intent evaluation results
output_directory
- directory to store files to
successes
- if True correct predictions are written to disk
errors
- if True incorrect predictions are written to disk
disable_plotting
- if True no plots are created
report_as_dict
- True if the evaluation report should be returned as a dict. If False, the report is returned in a human-readable text format. If None, report_as_dict is considered True in case an output_directory is given.
Returns:
dictionary with evaluation results
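For orientation, a sketch of the shape of the returned dictionary; the keys follow Rasa's scikit-learn-style report, but the exact layout may vary by version and the numbers below are made up:

```python
# Illustrative only - not the authoritative output format.
evaluation = {
    "predictions": [
        # one entry per test message: target intent, prediction, confidence
        {"text": "hi there", "intent": "greet", "predicted": "greet", "confidence": 0.97},
    ],
    "report": {},       # per-intent precision/recall/f1, dict or text (see report_as_dict)
    "precision": 0.93,  # made-up numbers
    "f1_score": 0.92,
    "accuracy": 0.94,
}
```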
merge_labels
Concatenates all labels of the aligned predictions.
Takes the aligned prediction labels which are grouped for each message and concatenates them.
Arguments:
aligned_predictions
- aligned predictions
extractor
- entity extractor name
Returns:
Concatenated predictions
merge_confidences
Concatenates all confidences of the aligned predictions.
Takes the aligned prediction confidences which are grouped for each message and concatenates them.
Arguments:
aligned_predictions
- aligned predictions
extractor
- entity extractor name
Returns:
Concatenated confidences
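Conceptually, both merge functions flatten the per-message groups into one long sequence so metrics can be computed over all tokens at once. A sketch with an illustrative (not authoritative) aligned-predictions layout:

```python
import itertools

# Hypothetical layout: one dict per message, token labels grouped per extractor.
aligned_predictions = [
    {"target_labels": ["O", "city"], "extractor_labels": {"DIETClassifier": ["O", "city"]}},
    {"target_labels": ["name"], "extractor_labels": {"DIETClassifier": ["O"]}},
]

# Concatenate the per-message label groups into a single flat list.
merged_targets = list(
    itertools.chain.from_iterable(p["target_labels"] for p in aligned_predictions)
)
# -> ["O", "city", "name"]
```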
substitute_labels
Replaces label names in a list of labels.
Arguments:
labels
- list of labels
old
- old label name that should be replaced
new
- new label name
Returns:
updated labels
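A minimal sketch; Rasa's evaluation uses this to swap the generic "O" (outside) tag for a no-entity label before computing metrics, though the replacement name shown here is only an example:

```python
from rasa.nlu.test import substitute_labels

labels = ["O", "city", "O", "name"]
updated = substitute_labels(labels, "O", "no_entity")  # "no_entity" is an example label
# -> ["no_entity", "city", "no_entity", "name"]
```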
collect_incorrect_entity_predictions
Get incorrect entity predictions.
Arguments:
entity_results
- entity evaluation results
merged_predictions
- list of predicted entity labels
merged_targets
- list of true entity labels
Returns:
list of incorrect predictions
write_successful_entity_predictions
Write correct entity predictions to a file.
Arguments:
entity_results
- entity evaluation results
merged_predictions
- list of predicted entity labels
merged_targets
- list of true entity labels
successes_filename
- filename of file to save correct predictions to
collect_successful_entity_predictions
Get correct entity predictions.
Arguments:
entity_results
- entity evaluation results
merged_predictions
- list of predicted entity labels
merged_targets
- list of true entity labels
Returns:
list of correct predictions
evaluate_entities
Creates summary statistics for each entity extractor.
Logs precision, recall, and F1 per entity type for each extractor.
Arguments:
entity_results
- entity evaluation results
extractors
- entity extractors to consider
output_directory
- directory to store files to
successes
- if True correct predictions are written to disk
errors
- if True incorrect predictions are written to disk
disable_plotting
- if True no plots are created
report_as_dict
- True if the evaluation report should be returned as a dict. If False, the report is returned in a human-readable text format. If None, report_as_dict is considered True in case an output_directory is given.
Returns:
dictionary with evaluation results
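The result is keyed by extractor name; an illustrative (not authoritative) shape with made-up numbers:

```python
entity_evaluation = {
    "DIETClassifier": {
        "report": {},       # precision/recall/f1 per entity type
        "precision": 0.88,  # made-up numbers
        "recall": 0.85,
        "f1_score": 0.86,
        "accuracy": 0.90,
    }
}
```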
is_token_within_entity
Checks if a token is within the boundaries of an entity.
does_token_cross_borders
Checks if a token crosses the boundaries of an entity.
determine_intersection
Calculates how many characters a given token and entity share.
do_entities_overlap
Checks if entities overlap.
I.e., they cross each other's start and end boundaries.
Arguments:
entities
- list of entities
Returns:
true if entities overlap, false otherwise
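A sketch using Rasa-style entity dicts, where start and end are character offsets into the message text (the spans below are contrived to cross):

```python
from rasa.nlu.test import do_entities_overlap

# For the text "New York City": the two spans cross each other's boundaries.
entities = [
    {"start": 0, "end": 8, "entity": "city", "value": "New York"},
    {"start": 4, "end": 13, "entity": "airport", "value": "York City"},
]
print(do_entities_overlap(entities))  # True
```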
find_intersecting_entities
Finds the entities that intersect with a token.
Arguments:
token
- a single token
entities
- entities found by a single extractor
Returns:
list of entities
pick_best_entity_fit
Determines the best fitting entity given intersecting entities.
Arguments:
token
- a single token
candidates
- entities found by a single extractor
attribute_key
- the attribute key of interest
Returns:
the value of the attribute key of the best fitting entity
determine_token_labels
Determines the token label for the provided attribute key given entities that do not overlap.
Arguments:
token
- a single token
entities
- entities found by a single extractor
extractors
- list of extractors
attribute_key
- the attribute key for which the entity type should be returned
Returns:
entity type
determine_entity_for_token
Determines the best fitting entity for the given token, given entities that do not overlap.
Arguments:
token
- a single token
entities
- entities found by a single extractor
extractors
- list of extractors
Returns:
entity type
do_any_extractors_not_support_overlap
Checks if any extractor does not support overlapping entities.
Arguments:
extractors
- names of the entity extractors
Returns:
True if and only if CRFEntityExtractor or DIETClassifier is in extractors
align_entity_predictions
Aligns entity predictions to the message tokens.
Determines for every token the true label based on the prediction targets and the label assigned by each single extractor.
Arguments:
result
- entity evaluation result
extractors
- the entity extractors that should be considered
Returns:
dictionary containing the true token labels and token labels from the extractors
align_all_entity_predictions
Aligns entity predictions to the message tokens for the whole dataset using align_entity_predictions.
Arguments:
entity_results
- list of entity prediction results
extractors
- the entity extractors that should be considered
Returns:
list of dictionaries containing the true token labels and token labels from the extractors
get_eval_data
Runs the model for the test set and extracts targets and predictions.
Returns intent results (intent targets and predictions, the original messages, and the confidences of the predictions), response results (response targets and predictions), as well as entity results (entity_targets, entity_predictions, and tokens).
Arguments:
processor
- the processor
test_data
- test data
Returns:
intent, response, and entity evaluation results
run_evaluation
Evaluate intent classification, response selection and entity extraction.
Arguments:
data_path
- path to the test data
processor
- the processor used to process and predict
output_directory
- path to folder where all output will be stored
successes
- if true successful predictions are written to a file
errors
- if true incorrect predictions are written to a file
disable_plotting
- if true confusion matrix and histogram will not be rendered
report_as_dict
- True if the evaluation report should be returned as a dict. If False, the report is returned in a human-readable text format. If None, report_as_dict is considered True in case an output_directory is given.
domain_path
- path to the domain file(s)
Returns:
dictionary containing evaluation results
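A hedged sketch of driving this programmatically, assuming a Rasa 3.x setup where run_evaluation is a coroutine and a trained model archive exists at the hypothetical path below; the rasa test nlu CLI covers the same flow:

```python
import asyncio
from rasa.core.agent import Agent
from rasa.nlu.test import run_evaluation

async def evaluate() -> dict:
    # Hypothetical model path; Agent.load unpacks the trained model.
    agent = Agent.load("models/nlu-model.tar.gz")
    return await run_evaluation(
        "data/test_nlu.yml",         # data_path: the test data
        agent.processor,             # the processor used to process and predict
        output_directory="results",  # reports, plots, success/error files land here
        errors=True,                 # write incorrect predictions to a file
    )

results = asyncio.run(evaluate())
```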
generate_folds
Generates n cross validation folds for given training data.
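A consumption sketch, assuming the generator yields (train, test) pairs of TrainingData with the fold count as the first argument, as cross_validate uses it internally:

```python
from rasa.nlu.test import generate_folds

# training_data is a rasa.shared.nlu.training_data.training_data.TrainingData
for train_split, test_split in generate_folds(5, training_data):
    # train a model on train_split, then evaluate it on test_split
    ...
```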
combine_result
Collects intent, response selection and entity metrics for cross validation folds.
If intent_results, response_selection_results or entity_results is provided as a list, prediction results are also collected.
Arguments:
intent_metrics
- intent metrics
entity_metrics
- entity metrics
response_selection_metrics
- response selection metrics
processor
- the processor
data
- training data
intent_results
- intent evaluation results
entity_results
- entity evaluation results
response_selection_results
- response selection evaluation results
Returns:
intent, entity, and response selection metrics
cross_validate
Stratified cross validation on data.
Arguments:
data
- Training Data
n_folds
- integer, number of cv folds
nlu_config
- nlu config file
output
- path to folder where reports are stored
successes
- if true successful predictions are written to a file
errors
- if true incorrect predictions are written to a file
disable_plotting
- if true no confusion matrix and histogram plots are created
report_as_dict
- True if the evaluation report should be returned as a dict. If False, the report is returned in a human-readable text format. If None, report_as_dict is considered True in case an output directory is given.
Returns:
dictionary with a key-to-list structure, where each entry in a list corresponds to the relevant result for one fold
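A usage sketch assuming the signature documented above (paths are hypothetical; in some Rasa versions this call may need to be awaited):

```python
from rasa.shared.nlu.training_data.loading import load_data
from rasa.nlu.test import cross_validate

data = load_data("data/nlu.yml")  # hypothetical training data path
results = cross_validate(
    data,
    n_folds=5,
    nlu_config="config.yml",      # hypothetical NLU config file
    output="cv_results",
    errors=True,
)
```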
compute_metrics
Computes metrics for intent classification, response selection and entity extraction.
Arguments:
processor
- the processor
training_data
- training data
Returns:
intent, response selection and entity metrics, and prediction results
compare_nlu
Trains and compares multiple NLU models. For each run and exclusion percentage, one model per config file is trained on only the current percentage of the training data. Afterwards, each model is tested on the complete test data of that run. All results are stored in the provided output directory.
Arguments:
configs
- config files needed for training
data
- training data
exclusion_percentages
- percentages of training data to exclude during comparison
f_score_results
- dictionary of model name to f-score results per run
model_names
- names of the models to train
output
- the output directory
runs
- number of comparison runs
Returns:
training examples per run
log_results
Logs results of cross validation.
Arguments:
results
- dictionary of results returned from cross validation
dataset_name
- string of which dataset the results are from, e.g. test/train
log_entity_results
Logs entity results of cross validation.
Arguments:
results
- dictionary of dictionaries of results returned from cross validation
dataset_name
- string of which dataset the results are from, e.g. test/train