Example for custom split and forecasting events¶
This notebook demonstrates how to adjust the splitters to split at custom events, as well as to forecast different categories (rather than the default labs).
In [ ]:
Copied!
import pandas as pd
from twinweaver import (
DataManager,
Config,
DataSplitterForecasting,
DataSplitterEvents,
ConverterInstruction,
DataSplitter,
)
import pandas as pd
from twinweaver import (
DataManager,
Config,
DataSplitterForecasting,
DataSplitterEvents,
ConverterInstruction,
DataSplitter,
)
Basic Setup¶
Load Data¶
In [ ]:
Copied!
# Load data - generated example data
# Paths are relative to this notebook's location in the repository.
df_events = pd.read_csv("../../example_data/events.csv")
df_constant = pd.read_csv("../../example_data/constant.csv")
df_constant_description = pd.read_csv("../../example_data/constant_description.csv")
# Load data - generated example data
# Paths are relative to this notebook's location in the repository.
df_events = pd.read_csv("../../example_data/events.csv")
df_constant = pd.read_csv("../../example_data/constant.csv")
df_constant_description = pd.read_csv("../../example_data/constant_description.csv")
Configuration and Data Manager¶
In [ ]:
Copied!
config = Config() # Override values here to customize pipeline
config.constant_columns_to_use = [
"birthyear",
"gender",
"histology",
"smoking_history",
] # Manually set from constant DF
config.constant_birthdate_column = "birthyear"
# <---------------------- IMPORTANT PARTS ---------------------------->
# To setup the different split events, we set this in the config
# In this example, we use biomarker events ("basic_biomarker") as custom split events
config.split_event_category = "basic_biomarker"
# And to forecast different categories, we set this in the config as well
# In this example, let's say we want to forecast vitals (i.e. body weight in the example data)
config.event_category_forecast = ["vitals"]
# To predict different variables for the event categories, we set up a mapping here
config.event_category_events_prediction_with_naming = {
"lot": "time to next lot", # Custom name in prompt: "time to next lot" instead of "lot"
}
config = Config() # Override values here to customize pipeline
config.constant_columns_to_use = [
"birthyear",
"gender",
"histology",
"smoking_history",
] # Manually set from constant DF
config.constant_birthdate_column = "birthyear"
# <---------------------- IMPORTANT PARTS ---------------------------->
# To setup the different split events, we set this in the config
# In this example, we use biomarker events ("basic_biomarker") as custom split events
config.split_event_category = "basic_biomarker"
# And to forecast different categories, we set this in the config as well
# In this example, let's say we want to forecast vitals (i.e. body weight in the example data)
config.event_category_forecast = ["vitals"]
# To predict different variables for the event categories, we set up a mapping here
config.event_category_events_prediction_with_naming = {
"lot": "time to next lot", # Custom name in prompt: "time to next lot" instead of "lot"
}
In [ ]:
Copied!
# Setup the data manager
dm = DataManager(config=config)
# Register the three input tables with the data manager
dm.load_indication_data(df_events=df_events, df_constant=df_constant, df_constant_description=df_constant_description)
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
# Reserve 10% of the data for validation and 10% for testing
dm.setup_hold_out_sets(validation_split=0.1, test_split=0.1)
dm.infer_var_types()
# Setup the data manager
dm = DataManager(config=config)
# Register the three input tables with the data manager
dm.load_indication_data(df_events=df_events, df_constant=df_constant, df_constant_description=df_constant_description)
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
# Reserve 10% of the data for validation and 10% for testing
dm.setup_hold_out_sets(validation_split=0.1, test_split=0.1)
dm.infer_var_types()
Initialize Splitters and Converter¶
In [ ]:
Copied!
# This data splitter handles event prediction tasks
data_splitter_events = DataSplitterEvents(
dm,
config=config,
# Sampling window bounds: 1 week up to 104 weeks (~2 years)
max_length_to_sample=pd.Timedelta(weeks=104),
min_length_to_sample=pd.Timedelta(weeks=1),
)
data_splitter_events.setup_variables()
# This data splitter handles forecasting tasks
data_splitter_forecasting = DataSplitterForecasting(
data_manager=dm,
config=config,
# Forecast horizons are capped at 90 days
max_forecasted_trajectory_length=pd.Timedelta(days=90),
)
# If you don't want to do forecasting QA, proportional sampling, or 3-sigma filtering, you can skip this step
data_splitter_forecasting.setup_statistics()
# We will also use the easier interface that combines both data splitters
data_splitter = DataSplitter(data_splitter_events, data_splitter_forecasting)
# Set up the converter instruction
converter = ConverterInstruction(
nr_tokens_budget_total=8192, # token budget for the generated prompt/answer pair
config=config,
dm=dm,
variable_stats=data_splitter_forecasting.variable_stats, # Optional, needed for forecasting QA tasks
)
# This data splitter handles event prediction tasks
data_splitter_events = DataSplitterEvents(
dm,
config=config,
# Sampling window bounds: 1 week up to 104 weeks (~2 years)
max_length_to_sample=pd.Timedelta(weeks=104),
min_length_to_sample=pd.Timedelta(weeks=1),
)
data_splitter_events.setup_variables()
# This data splitter handles forecasting tasks
data_splitter_forecasting = DataSplitterForecasting(
data_manager=dm,
config=config,
# Forecast horizons are capped at 90 days
max_forecasted_trajectory_length=pd.Timedelta(days=90),
)
# If you don't want to do forecasting QA, proportional sampling, or 3-sigma filtering, you can skip this step
data_splitter_forecasting.setup_statistics()
# We will also use the easier interface that combines both data splitters
data_splitter = DataSplitter(data_splitter_events, data_splitter_forecasting)
# Set up the converter instruction
converter = ConverterInstruction(
nr_tokens_budget_total=8192, # token budget for the generated prompt/answer pair
config=config,
dm=dm,
variable_stats=data_splitter_forecasting.variable_stats, # Optional, needed for forecasting QA tasks
)
Examine patient data¶
From the data manager we can retrieve the data of a single patient, for example the patient with this patient ID.
In [ ]:
Copied!
# Pick an arbitrary example patient (index 4) from the list of all patient IDs
patientid = dm.all_patientids[4]
patient_data = dm.get_patient_data(patientid)
# Pick an arbitrary example patient (index 4) from the list of all patient IDs
patientid = dm.all_patientids[4]
patient_data = dm.get_patient_data(patientid)
Convert patient data to string¶
Generate Training Splits¶
In [ ]:
Copied!
forecasting_splits, events_splits, reference_dates = data_splitter.get_splits_from_patient_with_target(
patient_data,
)
# Note, forecasting_splits will be none here
# NOTE(review): a later cell evaluates forecasting_splits[0], which would fail if it is None — confirm.
forecasting_splits, events_splits, reference_dates = data_splitter.get_splits_from_patient_with_target(
patient_data,
)
# Note, forecasting_splits will be none here
# NOTE(review): a later cell evaluates forecasting_splits[0], which would fail if it is None — confirm.
Now for each split, we can generate these strings. We just pick the first one as an example.
In [ ]:
Copied!
# Convert the first split into a text pair (see the "instruction"/"answer" keys used below)
split_idx = 0
p_converted = converter.forward_conversion(
forecasting_splits=None, # Set to None since we don't want to generate forecasting tasks
event_splits=events_splits[split_idx],
)
# Convert the first split into a text pair (see the "instruction"/"answer" keys used below)
split_idx = 0
p_converted = converter.forward_conversion(
forecasting_splits=None, # Set to None since we don't want to generate forecasting tasks
event_splits=events_splits[split_idx],
)
In [ ]:
Copied!
# NOTE(review): an earlier comment states forecasting_splits is None here;
# indexing it would then raise a TypeError — confirm which comment is current.
forecasting_splits[0]
forecasting_splits[0]
Inspect the Output¶
In [ ]:
Copied!
# Show the generated instruction (the prompt side of the pair)
print(p_converted["instruction"])
print(p_converted["instruction"])
In [ ]:
Copied!
# Show the generated answer (the target side of the pair)
print(p_converted["answer"])
print(p_converted["answer"])
Reverse Conversion: Text to Structured Data¶
In [ ]:
Copied!
# Reference date of the first split — presumably anchors relative dates when
# parsing the answer text back into structured data; TODO confirm.
date = reference_dates["date"][0]
return_list = converter.reverse_conversion(p_converted["answer"], dm, date)
return_list[0]["result"]
# Reference date of the first split — presumably anchors relative dates when
# parsing the answer text back into structured data; TODO confirm.
date = reference_dates["date"][0]
return_list = converter.reverse_conversion(p_converted["answer"], dm, date)
return_list[0]["result"]