Example of running inference
In [ ]:
Copied!
import pandas as pd
from twinweaver import (
DataSplitterForecasting,
DataManager,
DataSplitterEvents,
ConverterInstruction,
Config,
DataSplitter,
)
import pandas as pd
from twinweaver import (
DataSplitterForecasting,
DataManager,
DataSplitterEvents,
ConverterInstruction,
Config,
DataSplitter,
)
Set up basic data loading
In [ ]:
Copied!
# Read the three example CSV files (events, constant data, and the description of
# the constant data) from ./example_data into DataFrames used by the later cells.
df_events = pd.read_csv("./example_data/events.csv")
df_constant = pd.read_csv("./example_data/constant.csv")
df_constant_description = pd.read_csv("./example_data/constant_description.csv")
# Repeated copy of the cell above (this export lists each cell twice).
df_events = pd.read_csv("./example_data/events.csv")
df_constant = pd.read_csv("./example_data/constant.csv")
df_constant_description = pd.read_csv("./example_data/constant_description.csv")
In [ ]:
Copied!
# Create a fresh Config and set the fields this example relies on.
config = Config() # Override values here to customize pipeline
# <---------------------- CRITICAL CONFIGURATION ---------------------->
# 1. Event category used for data splitting (e.g., split data around Lines of Therapy 'lot')
# Has to be set for all instruction tasks
config.split_event_category = "lot"
# 2. List of event categories we want to forecast (e.g., forecasting 'lab' values)
# Only needs to be set if you want to forecast variables
config.event_category_forecast = ["lab"]
# 3. Mapping of the time-to-event targets to predict (here 'death' and 'progression')
# to the name used for each of them in the prompt
# Only needs to be set if you want to do time-to-event prediction
config.event_category_events_prediction_with_naming = {
"death": "death",
"progression": "next progression", # Custom name in prompt: "next progression" instead of "progression"
}
# Optional configs, here to correctly use the static information
config.constant_columns_to_use = ["birthyear", "gender", "histology", "smoking_history"] # Manually set from constant
config.constant_birthdate_column = "birthyear"
# Repeated copy of the configuration cell above (this export lists each cell twice).
config = Config() # Override values here to customize pipeline
# <---------------------- CRITICAL CONFIGURATION ---------------------->
# 1. Event category used for data splitting (e.g., split data around Lines of Therapy 'lot')
# Has to be set for all instruction tasks
config.split_event_category = "lot"
# 2. List of event categories we want to forecast (e.g., forecasting 'lab' values)
# Only needs to be set if you want to forecast variables
config.event_category_forecast = ["lab"]
# 3. Mapping of the time-to-event targets to predict (here 'death' and 'progression')
# to the name used for each of them in the prompt
# Only needs to be set if you want to do time-to-event prediction
config.event_category_events_prediction_with_naming = {
"death": "death",
"progression": "next progression", # Custom name in prompt: "next progression" instead of "progression"
}
# Optional configs, here to correctly use the static information
config.constant_columns_to_use = ["birthyear", "gender", "histology", "smoking_history"] # Manually set from constant
config.constant_birthdate_column = "birthyear"
In [ ]:
Copied!
# Build the objects needed for inference: a DataManager holding the loaded data,
# the two data splitters (events and forecasting), a combined DataSplitter, and
# the ConverterInstruction that is used later to produce the instruction text.
dm = DataManager(config=config)
# Hand the three DataFrames to the manager, then run its processing/setup steps in this order.
dm.load_indication_data(df_events=df_events, df_constant=df_constant, df_constant_description=df_constant_description)
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
# validation_split / test_split are both 0.1 (presumably fractions of the data — confirm in the library docs).
dm.setup_hold_out_sets(validation_split=0.1, test_split=0.1)
dm.infer_var_types()
# Event splitter, with the sampling length bounded between 1 week and 104 weeks.
data_splitter_events = DataSplitterEvents(
dm,
config=config,
max_length_to_sample=pd.Timedelta(weeks=104),
min_length_to_sample=pd.Timedelta(weeks=1),
)
data_splitter_events.setup_variables()
# Forecasting splitter, with the forecasted trajectory length capped at 90 days.
data_splitter_forecasting = DataSplitterForecasting(
data_manager=dm,
config=config,
max_forecasted_trajectory_length=pd.Timedelta(days=90),
)
# If you want to manually override the variables selected for forecasting, you can skip this next line.
data_splitter_forecasting.setup_statistics()
# We will use the easier interface that combines both data splitters
data_splitter = DataSplitter(data_splitter_events, data_splitter_forecasting)
# Converter with a total budget of 8192 tokens, sharing the same config and data manager.
converter = ConverterInstruction(
nr_tokens_budget_total=8192,
config=config,
dm=dm,
variable_stats=data_splitter_forecasting.variable_stats, # Optional, needed for forecasting QA tasks
)
# Repeated copy of the setup cell above (this export lists each cell twice).
dm = DataManager(config=config)
dm.load_indication_data(df_events=df_events, df_constant=df_constant, df_constant_description=df_constant_description)
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
dm.setup_hold_out_sets(validation_split=0.1, test_split=0.1)
dm.infer_var_types()
data_splitter_events = DataSplitterEvents(
dm,
config=config,
max_length_to_sample=pd.Timedelta(weeks=104),
min_length_to_sample=pd.Timedelta(weeks=1),
)
data_splitter_events.setup_variables()
data_splitter_forecasting = DataSplitterForecasting(
data_manager=dm,
config=config,
max_forecasted_trajectory_length=pd.Timedelta(days=90),
)
# If you want to manually override the variables selected for forecasting, you can skip this next line.
data_splitter_forecasting.setup_statistics()
# We will use the easier interface that combines both data splitters
data_splitter = DataSplitter(data_splitter_events, data_splitter_forecasting)
converter = ConverterInstruction(
nr_tokens_budget_total=8192,
config=config,
dm=dm,
variable_stats=data_splitter_forecasting.variable_stats, # Optional, needed for forecasting QA tasks
)
Example patient data
In [ ]:
Copied!
# Use the patient at index 2 of dm.all_patientids as the worked example.
patientid = dm.all_patientids[2]
# Bare expression: in a notebook cell this displays the selected id.
patientid
# Repeated copy of the cell above (this export lists each cell twice).
patientid = dm.all_patientids[2]
patientid
In [ ]:
Copied!
# Fetch the example patient's record and order its events by the "date" column.
patient_data = dm.get_patient_data(patientid)
patient_data["events"] = patient_data["events"].sort_values("date")
# To simulate having only the model input at inference time, keep the first half of the events.
# Floor division yields the same row count as int(len(...) / 2) without a float round-trip.
patient_data["events"] = patient_data["events"].iloc[: len(patient_data["events"]) // 2]
# Repeated copy of the cell above (this export lists each cell twice).
patient_data = dm.get_patient_data(patientid)
patient_data["events"] = patient_data["events"].sort_values("date")
# To simulate having only the model input at inference time, keep the first half of the events.
patient_data["events"] = patient_data["events"].iloc[: len(patient_data["events"]) // 2]
In [ ]:
Copied!
# Build the inference splits for the truncated patient record. With inference_type="both"
# the call returns two values, unpacked here as the forecasting split and the events split.
forecast_split, events_split = data_splitter.get_splits_from_patient_inference(
patient_data,
inference_type="both",
# Forecast only this variable instead of the splitter's own selection.
forecasting_override_variables_to_predict=["hemoglobin_-_718-7"],
# Time-to-event task: the "death" category, with a time delta of 52 * 7 = 364 days (52 weeks).
events_override_category="death",
events_override_observation_time_delta=pd.Timedelta(days=52 * 7),
)
# Repeated copy of the cell above (this export lists each cell twice).
forecast_split, events_split = data_splitter.get_splits_from_patient_inference(
patient_data,
inference_type="both",
forecasting_override_variables_to_predict=["hemoglobin_-_718-7"],
events_override_category="death",
events_override_observation_time_delta=pd.Timedelta(days=52 * 7),
)
In [ ]:
Copied!
# Forecast horizons per variable, expressed in weeks into the future.
# The key is first matched against event_name; if that lookup fails, the
# provided value is used as given.
forecasting_times_to_predict = {"hemoglobin_-_718-7": [4, 8, 12]}
# Repeated copy of the cell above (this export lists each cell twice).
forecasting_times_to_predict = {"hemoglobin_-_718-7": [4, 8, 12]}
In [ ]:
Copied!
# Convert both splits into the instruction format. The result is indexed with the
# "instruction" key in the next cell.
converted = converter.forward_conversion_inference(
forecasting_split=forecast_split,
# Per-variable forecast horizons in weeks, as defined in the previous cell.
forecasting_future_weeks_per_variable=forecasting_times_to_predict,
event_split=events_split,
# No custom tasks are passed in this example.
custom_tasks=None,
)
# Repeated copy of the cell above (this export lists each cell twice).
converted = converter.forward_conversion_inference(
forecasting_split=forecast_split,
forecasting_future_weeks_per_variable=forecasting_times_to_predict,
event_split=events_split,
custom_tasks=None,
)
In [ ]:
Copied!
# Print the value stored under the "instruction" key of the conversion result.
print(converted["instruction"])
# Repeated copy of the cell above (this export lists each cell twice).
print(converted["instruction"])