Example of running inference¶
In [ ]:
Copied!
import pandas as pd
from twinweaver import (
DataSplitterForecasting,
DataManager,
DataSplitterEvents,
ConverterInstruction,
Config,
DataSplitter,
)
import pandas as pd
from twinweaver import (
DataSplitterForecasting,
DataManager,
DataSplitterEvents,
ConverterInstruction,
Config,
DataSplitter,
)
Setup basic data loading¶
In [ ]:
Copied!
# Read the three example CSV files (events, constant, constant_description) into pandas DataFrames
df_events = pd.read_csv("./example_data/events.csv")
df_constant = pd.read_csv("./example_data/constant.csv")
df_constant_description = pd.read_csv("./example_data/constant_description.csv")
# Read the three example CSV files (events, constant, constant_description) into pandas DataFrames
df_events = pd.read_csv("./example_data/events.csv")
df_constant = pd.read_csv("./example_data/constant.csv")
df_constant_description = pd.read_csv("./example_data/constant_description.csv")
In [ ]:
Copied!
config = Config() # Override values here to customize pipeline
config.constant_columns_to_use = ["birthyear", "gender", "histology", "smoking_history"] # Manually set from constant
config.constant_birthdate_column = "birthyear"
# Create the data manager, hand it the three loaded tables, then run its preparation steps in this order
dm = DataManager(config=config)
dm.load_indication_data(df_events=df_events, df_constant=df_constant, df_constant_description=df_constant_description)
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
dm.setup_dataset_splits()
dm.infer_var_types()
# Event splitter, built on the data manager and the same config
data_splitter_events = DataSplitterEvents(dm, config=config)
data_splitter_events.setup_variables()
# Forecasting splitter, built on the data manager and the same config
data_splitter_forecasting = DataSplitterForecasting(
data_manager=dm,
config=config,
)
# If you want to manually override the variables selected for forecasting, you can skip this next line.
data_splitter_forecasting.setup_statistics()
# We will use the easier interface that combines both data splitters
data_splitter = DataSplitter(data_splitter_events, data_splitter_forecasting)
# Instruction converter, configured with nr_tokens_budget_total=8192 and the forecasting splitter's variable statistics
converter = ConverterInstruction(
nr_tokens_budget_total=8192,
config=config,
dm=dm,
variable_stats=data_splitter_forecasting.variable_stats, # Optional, needed for forecasting QA tasks
)
config = Config() # Override values here to customize pipeline
config.constant_columns_to_use = ["birthyear", "gender", "histology", "smoking_history"] # Manually set from constant
config.constant_birthdate_column = "birthyear"
# Create the data manager, hand it the three loaded tables, then run its preparation steps in this order
dm = DataManager(config=config)
dm.load_indication_data(df_events=df_events, df_constant=df_constant, df_constant_description=df_constant_description)
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
dm.setup_dataset_splits()
dm.infer_var_types()
# Event splitter, built on the data manager and the same config
data_splitter_events = DataSplitterEvents(dm, config=config)
data_splitter_events.setup_variables()
# Forecasting splitter, built on the data manager and the same config
data_splitter_forecasting = DataSplitterForecasting(
data_manager=dm,
config=config,
)
# If you want to manually override the variables selected for forecasting, you can skip this next line.
data_splitter_forecasting.setup_statistics()
# We will use the easier interface that combines both data splitters
data_splitter = DataSplitter(data_splitter_events, data_splitter_forecasting)
# Instruction converter, configured with nr_tokens_budget_total=8192 and the forecasting splitter's variable statistics
converter = ConverterInstruction(
nr_tokens_budget_total=8192,
config=config,
dm=dm,
variable_stats=data_splitter_forecasting.variable_stats, # Optional, needed for forecasting QA tasks
)
Example patient data¶
In [ ]:
Copied!
# Use the entry at index 2 of the data manager's patient ID list as the example patient
patientid = dm.all_patientids[2]
# Bare expression so the notebook cell displays the selected ID
patientid
# Use the entry at index 2 of the data manager's patient ID list as the example patient
patientid = dm.all_patientids[2]
# Bare expression so the notebook cell displays the selected ID
patientid
In [ ]:
Copied!
patient_data = dm.get_patient_data(patientid)
# Order the patient's events by their "date" column
patient_data["events"] = patient_data["events"].sort_values("date")
# Simulate the inference setting, where only the input part of the history is
# available, by keeping just the first half of the sorted events
n_input_events = len(patient_data["events"]) // 2
patient_data["events"] = patient_data["events"].iloc[:n_input_events]
patient_data = dm.get_patient_data(patientid)
# Order the patient's events by their "date" column
patient_data["events"] = patient_data["events"].sort_values("date")
# Simulate the inference setting, where only the input part of the history is
# available, by keeping just the first half of the sorted events
n_input_events = len(patient_data["events"]) // 2
patient_data["events"] = patient_data["events"].iloc[:n_input_events]
In [ ]:
Copied!
# Build the inference-time splits for the example patient. inference_type="both" yields a
# forecasting split and an events split; the overrides select the hemoglobin variable to
# predict, the "death" event category, and an observation time delta of 52 weeks (52 * 7 days).
forecast_split, events_split = data_splitter.get_splits_from_patient_inference(
patient_data,
inference_type="both",
forecasting_override_variables_to_predict=["hemoglobin_-_718-7"],
events_override_category="death",
events_override_observation_time_delta=pd.Timedelta(days=52 * 7),
)
# Build the inference-time splits for the example patient. inference_type="both" yields a
# forecasting split and an events split; the overrides select the hemoglobin variable to
# predict, the "death" event category, and an observation time delta of 52 weeks (52 * 7 days).
forecast_split, events_split = data_splitter.get_splits_from_patient_inference(
patient_data,
inference_type="both",
forecasting_override_variables_to_predict=["hemoglobin_-_718-7"],
events_override_category="death",
events_override_observation_time_delta=pd.Timedelta(days=52 * 7),
)
In [ ]:
Copied!
# Forecast horizons, in weeks into the future, requested for each variable to predict
forecasting_times_to_predict = {"hemoglobin_-_718-7": [4, 8, 12]}
# Forecast horizons, in weeks into the future, requested for each variable to predict
forecasting_times_to_predict = {"hemoglobin_-_718-7": [4, 8, 12]}
In [ ]:
Copied!
# Convert the forecasting and event splits into an instruction, using the per-variable
# forecast weeks defined above; no custom tasks are passed
converted = converter.forward_conversion_inference(
forecasting_split=forecast_split,
forecasting_future_weeks_per_variable=forecasting_times_to_predict,
event_split=events_split,
custom_tasks=None,
)
# Convert the forecasting and event splits into an instruction, using the per-variable
# forecast weeks defined above; no custom tasks are passed
converted = converter.forward_conversion_inference(
forecasting_split=forecast_split,
forecasting_future_weeks_per_variable=forecasting_times_to_predict,
event_split=events_split,
custom_tasks=None,
)
In [ ]:
Copied!
# Print the "instruction" entry of the conversion result
print(converted["instruction"])
# Print the "instruction" entry of the conversion result
print(converted["instruction"])