Converter Forecasting QA¶

twinweaver.instruction.converter_forecasting_qa ¶

Classes¶

ConverterForecastingQA ¶

Bases: ConverterForecasting

A class for converting patient data into a format suitable for manual forecasting with a question-answering approach.

This class inherits from ConverterForecasting and implements additional methods to preprocess target data into categorical bins and generate prompts for the forecasting model.

Source code in twinweaver/instruction/converter_forecasting_qa.py

class ConverterForecastingQA(ConverterForecasting):
    """
    A class for converting patient data into a format suitable for manual forecasting
    with a question-answering approach.

    This class inherits from ConverterForecasting and implements additional methods
    to preprocess target data into categorical bins and generate prompts for the
    forecasting model.
    """

    def __init__(self, variable_stats: pd.DataFrame, **kwargs):
        """
        Store the per-variable statistics and forward everything else to the parent.

        Parameters
        ----------
        variable_stats : pd.DataFrame
            Table that is looked up by ``config.event_name_col`` and read for the
            ``"is_numeric"`` column and the ``config.bins_split_name`` column
            (the bin edges of each variable).
        **kwargs
            Passed unchanged to ``ConverterForecasting.__init__``.
        """
        super().__init__(**kwargs)
        self.variable_stats = variable_stats

    def _preprocess_target_into_qa(self, patient_split: DataSplitterForecastingOption) -> DataSplitterForecastingOption:
        """
        Preprocesses the target data into categories suitable for question-answering format.

        Every numeric target value is replaced by the label ("A", "B", ...) of the bin
        it falls into. The outermost bin edges are widened to -inf / +inf so that every
        value is assigned to a bin. The input object is not modified; a deep copy is
        returned.

        Parameters
        ----------
        patient_split : DataSplitterForecastingOption
            A data splitter object containing patient data split into events and target variables.

        Returns
        -------
        DataSplitterForecastingOption
            A copy of patient_split whose target events are converted into categorical bin
            labels and whose ``category_splits`` attribute maps each variable to its
            ``{label: bin description}`` dictionary.

        Raises
        ------
        ValueError
            If a target variable has no entry in the variable statistics, is not numeric,
            or has more bins than there are single-letter labels (26).
        """

        assert self.variable_stats is not None, (
            "Variable statistics must be set up before preprocessing target into QA format. "
            "Please run setup_statistics() first on the DataSplitterForecasting. "
            "Alternatively, you can disable forecasting QA in the conversion call."
        )

        #: work on a deep copy so the caller's split is left untouched
        curr_patient_split = copy.deepcopy(patient_split)

        name_col = self.config.event_name_col
        value_col = self.config.event_value_col

        #: loop through all target variables
        targets = curr_patient_split.target_events_after_split
        all_variables = targets[name_col].unique().tolist()
        targets[value_col] = targets[value_col].astype("string")
        curr_patient_split.category_splits = {}

        for var in all_variables:
            # Look up the statistics of this variable once
            var_stats = self.variable_stats.loc[self.variable_stats[name_col] == var]
            if var_stats.empty:
                raise ValueError(f"Variable {var} has no variable statistics, cannot bin for QA forecasting.")

            # Check if variable is numeric or categorical
            is_numeric = var_stats["is_numeric"].values[0]
            if not is_numeric:
                raise ValueError(f"Variable {var} is not numeric, cannot bin for QA forecasting.")

            #: change every variable in target to be a category based on variable stats in event_value
            #: use e.g. "a" (= "Bin (0,2]") as new value

            # Shallow-copy the edges: the stored object belongs to self.variable_stats and
            # must not be overwritten by the -inf/inf replacement below.
            splits = list(var_stats[self.config.bins_split_name].values[0])

            # Replace both edges for splits with -inf/inf
            splits[0] = -float("inf")
            splits[-1] = float("inf")
            bin_ranges = [
                f"Bin ({round_and_strip(lower, self.decimal_precision)}, "
                + f"{round_and_strip(upper, self.decimal_precision)}]"
                for lower, upper in zip(splits, splits[1:])
            ]

            # Change last bin closing to be exclusive for inf
            bin_ranges[-1] = bin_ranges[-1].replace("]", ")")

            #: make bins into A, B, C, etc. (as many labels as there are bins)
            if len(bin_ranges) > 26:
                raise ValueError(
                    f"Variable {var} has {len(bin_ranges)} bins, but at most 26 are supported for QA forecasting."
                )
            bin_names = [chr(ord("A") + i) for i in range(len(bin_ranges))]
            bin_mapping = dict(zip(bin_names, bin_ranges))

            # Apply binning
            target_selection = targets[name_col] == var
            curr_targets = targets.loc[target_selection, value_col].copy()

            new_targets = pd.cut(
                curr_targets.astype(float),
                bins=splits,
                labels=bin_names,
                include_lowest=True,
            )
            new_targets = new_targets.astype("string")
            targets.loc[target_selection, value_col] = new_targets

            #: remember which label stands for which range
            curr_patient_split.category_splits[var] = bin_mapping

        #: save & return
        curr_patient_split.target_events_after_split = targets
        return curr_patient_split

    def _generate_prompt(self, target_meta: dict) -> str:
        """
        Generates a prompt string for the forecasting model based on the target
        metadata and variable statistics.

        Parameters
        ----------
        target_meta : dict
            A dictionary containing metadata about the target variables. The keys
            ``"future_prediction_time_per_variable"``, ``"variable_name_mapping"`` and
            ``"category_splits"`` are read.

        Returns
        -------
        str
            The generated prompt string for the forecasting model.

        """

        #: round the prediction times of every variable to self.decimal_precision
        rounded_times = {
            variable: [round_and_strip(t, self.decimal_precision) for t in times]
            for variable, times in target_meta["future_prediction_time_per_variable"].items()
        }

        #: start with the base prompt
        prompt_parts = [self.config.qa_prompt_start]

        #: create prompt for which variables to predict, sorted alphabetically by variable name
        for variable in sorted(rounded_times):
            #: need event_descriptive_name
            variable_desc_name = target_meta["variable_name_mapping"][variable]

            # Variable name followed by the times to predict.
            # Joined explicitly rather than stripping brackets and quotes from str(list),
            # which would also remove those characters from the values themselves.
            prompt_parts.append("\n\t" + variable_desc_name)
            prompt_parts.append(self.config.forecasting_prompt_var_time)
            prompt_parts.append(", ".join(str(t) for t in rounded_times[variable]))

            #: add in string for which variables have which category values
            bin_mapping = target_meta["category_splits"][variable]
            processed_bins = ", ".join(f"{label}: {desc}" for label, desc in bin_mapping.items()).lower()

            prompt_parts.append("\n" + self.config.qa_bins_start)
            prompt_parts.append(processed_bins)

        #: return
        return "".join(prompt_parts)

    def forward_conversion(self, patient_split: DataSplitterForecastingOption) -> tuple:
        """
        Converts the patient data into a format suitable for the forecasting model,
        including preprocessing and prompt generation.

        Parameters
        ----------
        patient_split : DataSplitterForecastingOption
            A data splitter object containing patient data split into events and target variables.

        Returns
        -------
        prompt_str : str
            The generated prompt string.
        target_str : str
            The generated target string.
        target_meta : dict
            Metadata associated with the target. In addition to what
            ``_generate_target_string`` returns, it holds ``"category_splits"`` (the bin
            mapping per variable) and ``"target_data_raw"`` (the targets before binning).
        """

        #: keep the unbinned targets, then preprocess target data into categories
        raw_target = patient_split.target_events_after_split.copy()
        binned_split = self._preprocess_target_into_qa(patient_split)

        #: generate target string
        target_str, target_meta = self._generate_target_string(binned_split)

        #: make sure that target_meta contains the new targets in categories
        target_meta["category_splits"] = binned_split.category_splits
        target_meta["target_data_raw"] = raw_target

        #: generate prompt (including when to generate what)
        prompt_str = self._generate_prompt(target_meta)

        return prompt_str, target_str, target_meta

Functions¶

forward_conversion ¶

forward_conversion(patient_split)

Converts the patient data into a format suitable for the forecasting model, including preprocessing and prompt generation.

Parameters:

Name	Type	Description	Default
`patient_split`	`DataSplitterForecastingOption`	A data splitter object containing patient data split into events and target variables.	required

Returns:

Name	Type	Description
`prompt_str`	`str`	The generated prompt string.
`target_str`	`str`	The generated target string.
`target_meta`	`dict`	Metadata associated with the target.

Source code in twinweaver/instruction/converter_forecasting_qa.py

def forward_conversion(self, patient_split: DataSplitterForecastingOption) -> tuple:
    """
    Run the full QA conversion for one patient split.

    The targets are first binned into categories, then the target string is
    generated from the binned split, and finally the prompt is built from the
    resulting metadata.

    Parameters
    ----------
    patient_split : DataSplitterForecastingOption
        A data splitter object containing patient data split into events and target variables.

    Returns
    -------
    prompt_str : str
        The generated prompt string.
    target_str : str
        The generated target string.
    target_meta : dict
        Metadata associated with the target, extended with the keys
        ``"category_splits"`` and ``"target_data_raw"``.
    """

    # Snapshot the targets before they are replaced by bin labels.
    unbinned_targets = patient_split.target_events_after_split.copy()

    # Bin the targets; the name `patient_split` is rebound to the returned object,
    # exactly as the later steps expect.
    patient_split = self._preprocess_target_into_qa(patient_split)

    # Build the answer text and its metadata from the binned split.
    generated = self._generate_target_string(patient_split)
    target_str, target_meta = generated

    # Attach the bin definitions and the original values to the metadata.
    target_meta["category_splits"] = patient_split.category_splits
    target_meta["target_data_raw"] = unbinned_targets

    # The prompt is derived from the enriched metadata.
    return self._generate_prompt(target_meta), target_str, target_meta