Example of running inference

In [ ]:

Copied!





import pandas as pd

from twinweaver import (
    DataSplitterForecasting,
    DataManager,
    DataSplitterEvents,
    ConverterInstruction,
    Config,
    DataSplitter,
)
import pandas as pd

from twinweaver import (
    DataSplitterForecasting,
    DataManager,
    DataSplitterEvents,
    ConverterInstruction,
    Config,
    DataSplitter,
)

Set up basic data loading

In [ ]:

Copied!

# Read the three example CSV files once each (paths are relative to the
# current working directory). They are handed to the DataManager further down.
df_events = pd.read_csv("./example_data/events.csv")
df_constant = pd.read_csv("./example_data/constant.csv")
df_constant_description = pd.read_csv("./example_data/constant_description.csv")

In [ ]:

Copied!





config = Config()  # Override values here to customize pipeline
config.constant_columns_to_use = ["birthyear", "gender", "histology", "smoking_history"]  # Manually set from constant
config.constant_birthdate_column = "birthyear"

# Hand the loaded tables to the data manager and run its preparation steps
# in order; the splitters and the converter below all read from `dm`.
dm = DataManager(config=config)
dm.load_indication_data(df_events=df_events, df_constant=df_constant, df_constant_description=df_constant_description)
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
dm.setup_dataset_splits()
dm.infer_var_types()

# Splitter for the event-prediction task.
data_splitter_events = DataSplitterEvents(dm, config=config)
data_splitter_events.setup_variables()

# Splitter for the forecasting task.
data_splitter_forecasting = DataSplitterForecasting(
    data_manager=dm,
    config=config,
)
# In case you manually want to override the variables for forecasting selection, you can skip this next line.
data_splitter_forecasting.setup_statistics()

# We will use the easier interface that combines both data splitters
data_splitter = DataSplitter(data_splitter_events, data_splitter_forecasting)

# Converter that turns the splits into an instruction, built with a total
# token budget of 8192.
converter = ConverterInstruction(
    nr_tokens_budget_total=8192,
    config=config,
    dm=dm,
    variable_stats=data_splitter_forecasting.variable_stats,  # Optional, needed for forecasting QA tasks
)

Example patient data

In [ ]:

Copied!

# Use the third entry of the data manager's patient ID list as the running
# example; the bare expression on the last line displays it as the cell output.
patientid = dm.all_patientids[2]
patientid

In [ ]:

Copied!

# Fetch this patient's data and order the events chronologically.
patient_data = dm.get_patient_data(patientid)
patient_data["events"] = patient_data["events"].sort_values("date")

# Simulate an inference-time input: keep only the first half of the
# date-sorted events, as if the later half had not been observed yet.
patient_data["events"] = patient_data["events"].iloc[: len(patient_data["events"]) // 2]

In [ ]:

Copied!





# Build the forecasting split and the events split for this patient in one
# call. The overrides pin the forecast target to one variable, the event
# category to "death", and the event observation window to 52 weeks.
forecast_split, events_split = data_splitter.get_splits_from_patient_inference(
    patient_data,
    inference_type="both",
    forecasting_override_variables_to_predict=["hemoglobin_-_718-7"],
    events_override_category="death",
    events_override_observation_time_delta=pd.Timedelta(days=52 * 7),
)

In [ ]:

Copied!





# For each variable to forecast, list how far into the future (in weeks)
# a prediction should be made.
forecasting_times_to_predict = {
    "hemoglobin_-_718-7": [4, 8, 12],
}

In [ ]:

Copied!





# Convert both splits into an instruction, asking for forecasts at the
# per-variable week horizons defined above and adding no custom tasks.
converted = converter.forward_conversion_inference(
    forecasting_split=forecast_split,
    forecasting_future_weeks_per_variable=forecasting_times_to_predict,
    event_split=events_split,
    custom_tasks=None,
)

In [ ]:

Copied!

# Show the generated instruction text once.
print(converted["instruction"])