Example: importing data in the MEDS format¶
In [ ]:
Copied!
import pandas as pd
import numpy as np
from datetime import datetime
from twinweaver import (
convert_meds_to_dtc,
DataManager,
DataSplitterEvents,
ConverterInstruction,
Config,
)
import pandas as pd
import numpy as np
from datetime import datetime
from twinweaver import (
convert_meds_to_dtc,
DataManager,
DataSplitterEvents,
ConverterInstruction,
Config,
)
Synthetic example¶
Here we provide synthetic example data as generated by Gemini.
In [ ]:
Copied!
# Code metadata: one (code, description) pair per code used by the synthetic
# events in this example, grouped by kind.
_CODE_DESCRIPTIONS = [
    # Static Measurements
    ("GENDER/Female", "Female sex"),
    ("GENDER/Male", "Male sex"),
    ("GENETIC/BRCA1_pos", "BRCA1 gene mutation"),
    # Visit and Administrative Codes
    ("ADMISSION/Outpatient", "Admission for an outpatient clinic visit"),
    ("ADMISSION/Inpatient", "Admission to the hospital for an inpatient stay"),
    ("DISCHARGE/Outpatient", "Discharge from an outpatient clinic visit"),
    ("DISCHARGE/Inpatient", "Discharge from an inpatient hospital stay"),
    ("NOTE/FollowUp", "Clinical note for a follow-up appointment"),
    # Diagnosis Codes (ICD-10-CM)
    ("ICD10CM/C34.90", "Malignant neoplasm of unspecified part of unspecified bronchus or lung"),
    ("ICD10CM/C61", "Malignant neoplasm of prostate"),
    # Symptom Codes
    ("SYMPTOM/Cough", "Patient reports a persistent cough"),
    # Procedure Codes (CPT)
    ("CPT/71250", "Procedure code for a CT scan of the thorax without contrast"),
    ("CPT/32408", "Procedure code for a core needle biopsy of the lung or mediastinum"),
    ("CPT/55700", "Procedure code for a needle biopsy of the prostate"),
    ("CPT/55840", "Procedure code for a radical retropubic prostatectomy"),
    # Lab Codes (LOINC)
    ("LOINC/6690-2", "Leukocytes [#/volume] in Blood by Automated count (White Blood Cell Count)"),
    ("LOINC/2039-6", "Carcinoembryonic Ag [Mass/volume] in Serum or Plasma (CEA Tumor Marker)"),
    ("LOINC/59261-8", "Comprehensive metabolic 2014 panel - Serum or Plasma"),
    ("LOINC/2857-1", "Prostate specific Ag [Mass/volume] in Serum or Plasma (PSA Test)"),
    # Medication Codes
    ("RX/Cisplatin", "Administration of Cisplatin chemotherapy agent"),
    # Death
    ("DEATH", "Death"),
]

# Expand the pairs into records with "code" and "description" keys.
code_metadata_list = [
    {"code": code, "description": description}
    for code, description in _CODE_DESCRIPTIONS
]
code_metadata_df = pd.DataFrame(code_metadata_list)
# Patient Events DataFrame
def _event(subject_id, time, code, numeric_value=np.nan, text_value=np.nan):
    """Build one event row; values that are not given default to NaN."""
    return {
        "subject_id": subject_id,
        "time": time,
        "code": code,
        "numeric_value": numeric_value,
        "text_value": text_value,
    }


patient_events_list = [
    # Patient 101: Jane Doe (Lung Cancer) - Assigned to 'train' split
    # Static data (no timestamp)
    _event(101, pd.NaT, "GENDER/Female", text_value="Female"),
    _event(101, pd.NaT, "GENETIC/BRCA1_pos", numeric_value=1, text_value="Positive"),
    # Visit 1 (Week 2, 2024): Diagnosis
    _event(101, datetime(2024, 1, 8), "ADMISSION/Outpatient"),
    _event(101, datetime(2024, 1, 8), "SYMPTOM/Cough", text_value="Persistent for 2 months"),
    _event(101, datetime(2024, 1, 8), "LOINC/6690-2", numeric_value=12.5),
    _event(101, datetime(2024, 1, 8), "CPT/71250", text_value="Nodule found in right lung"),
    _event(101, datetime(2024, 1, 8), "CPT/32408"),
    _event(101, datetime(2024, 1, 8), "ICD10CM/C34.90", text_value="Primary Diagnosis"),
    _event(101, datetime(2024, 1, 8), "DISCHARGE/Outpatient"),
    # Visit 2 (Week 4, 2024): Treatment
    _event(101, datetime(2024, 1, 22), "ADMISSION/Inpatient"),
    _event(101, datetime(2024, 1, 22), "LOINC/59261-8", text_value="All values within normal limits"),
    _event(101, datetime(2024, 1, 22), "RX/Cisplatin", text_value="Cisplatin"),
    _event(101, datetime(2024, 1, 22), "DISCHARGE/Inpatient"),
    # Visit 3 (Week 8, 2024): Follow-up
    _event(101, datetime(2024, 2, 19), "ADMISSION/Outpatient"),
    _event(101, datetime(2024, 2, 19), "NOTE/FollowUp", text_value="Patient tolerated first cycle well."),
    _event(101, datetime(2024, 2, 19), "LOINC/2039-6", numeric_value=50.2),
    _event(101, datetime(2024, 2, 19), "DISCHARGE/Outpatient"),
    # Patient 202: John Smith (Prostate Cancer) - Assigned to 'held_out' split
    # Static data (no timestamp)
    _event(202, pd.NaT, "GENDER/Male", text_value="Male"),
    # Visit 1 (Week 10, 2024): Diagnosis
    _event(202, datetime(2024, 3, 4), "ADMISSION/Outpatient"),
    _event(202, datetime(2024, 3, 4), "LOINC/2857-1", numeric_value=15.1),
    _event(202, datetime(2024, 3, 4), "CPT/55700", text_value="Biopsy taken"),
    _event(202, datetime(2024, 3, 4), "ICD10CM/C61", text_value="Primary Diagnosis"),
    _event(202, datetime(2024, 3, 4), "DISCHARGE/Outpatient"),
    # Visit 2 (Week 14, 2024): Treatment (Surgery)
    _event(202, datetime(2024, 4, 1), "ADMISSION/Inpatient"),
    _event(202, datetime(2024, 4, 1), "CPT/55840", text_value="Surgical procedure completed."),
    _event(202, datetime(2024, 4, 1), "LOINC/6690-2", numeric_value=8.2),
    _event(202, datetime(2024, 4, 1), "DISCHARGE/Inpatient"),
    # Visit 3 (Week 20, 2024): Follow-up
    _event(202, datetime(2024, 5, 13), "ADMISSION/Outpatient"),
    _event(202, datetime(2024, 5, 13), "LOINC/2857-1", numeric_value=0.1),
    _event(202, datetime(2024, 5, 13), "NOTE/FollowUp", text_value="PSA levels are undetectable post-op."),
    _event(202, datetime(2024, 5, 13), "DISCHARGE/Outpatient"),
    # Death, one year after the last follow-up
    _event(202, datetime(2025, 5, 13), "DEATH"),
]

# Build the frame, then normalise "time" to datetimes and "subject_id" to strings.
patient_events_df = pd.DataFrame(patient_events_list).assign(
    time=lambda frame: pd.to_datetime(frame["time"]),
    subject_id=lambda frame: frame["subject_id"].astype(str),
)
# Subject Splits DataFrame
# One row per subject, naming the data split that subject belongs to.
subject_splits_list = [
    {"subject_id": 101, "split": "train"},
    {
        "subject_id": 202,
        "split": "held_out",
    },  # 'held_out' is often used for the final test set
]
subject_splits_df = pd.DataFrame(subject_splits_list)
# patient_events_df["subject_id"] is cast to str and patients are looked up by
# string id ("101") later on, so cast here too: the id column then has the same
# dtype in both tables and an int-vs-str key mismatch cannot occur when they
# are joined.
subject_splits_df["subject_id"] = subject_splits_df["subject_id"].astype(str)
# Code metadata: one (code, description) pair per code used by the synthetic
# events in this example, grouped by kind.
_CODE_DESCRIPTIONS = [
    # Static Measurements
    ("GENDER/Female", "Female sex"),
    ("GENDER/Male", "Male sex"),
    ("GENETIC/BRCA1_pos", "BRCA1 gene mutation"),
    # Visit and Administrative Codes
    ("ADMISSION/Outpatient", "Admission for an outpatient clinic visit"),
    ("ADMISSION/Inpatient", "Admission to the hospital for an inpatient stay"),
    ("DISCHARGE/Outpatient", "Discharge from an outpatient clinic visit"),
    ("DISCHARGE/Inpatient", "Discharge from an inpatient hospital stay"),
    ("NOTE/FollowUp", "Clinical note for a follow-up appointment"),
    # Diagnosis Codes (ICD-10-CM)
    ("ICD10CM/C34.90", "Malignant neoplasm of unspecified part of unspecified bronchus or lung"),
    ("ICD10CM/C61", "Malignant neoplasm of prostate"),
    # Symptom Codes
    ("SYMPTOM/Cough", "Patient reports a persistent cough"),
    # Procedure Codes (CPT)
    ("CPT/71250", "Procedure code for a CT scan of the thorax without contrast"),
    ("CPT/32408", "Procedure code for a core needle biopsy of the lung or mediastinum"),
    ("CPT/55700", "Procedure code for a needle biopsy of the prostate"),
    ("CPT/55840", "Procedure code for a radical retropubic prostatectomy"),
    # Lab Codes (LOINC)
    ("LOINC/6690-2", "Leukocytes [#/volume] in Blood by Automated count (White Blood Cell Count)"),
    ("LOINC/2039-6", "Carcinoembryonic Ag [Mass/volume] in Serum or Plasma (CEA Tumor Marker)"),
    ("LOINC/59261-8", "Comprehensive metabolic 2014 panel - Serum or Plasma"),
    ("LOINC/2857-1", "Prostate specific Ag [Mass/volume] in Serum or Plasma (PSA Test)"),
    # Medication Codes
    ("RX/Cisplatin", "Administration of Cisplatin chemotherapy agent"),
    # Death
    ("DEATH", "Death"),
]

# Expand the pairs into records with "code" and "description" keys.
code_metadata_list = [
    {"code": code, "description": description}
    for code, description in _CODE_DESCRIPTIONS
]
code_metadata_df = pd.DataFrame(code_metadata_list)
# Patient Events DataFrame
patient_events_list = [
# Patient 101: Jane Doe (Lung Cancer) - Assigned to 'train' split
# Static data
{
"subject_id": 101,
"time": pd.NaT,
"code": "GENDER/Female",
"numeric_value": np.nan,
"text_value": "Female",
},
{
"subject_id": 101,
"time": pd.NaT,
"code": "GENETIC/BRCA1_pos",
"numeric_value": 1,
"text_value": "Positive",
},
# Visit 1 (Week 2, 2024): Diagnosis
{
"subject_id": 101,
"time": datetime(2024, 1, 8),
"code": "ADMISSION/Outpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
{
"subject_id": 101,
"time": datetime(2024, 1, 8),
"code": "SYMPTOM/Cough",
"numeric_value": np.nan,
"text_value": "Persistent for 2 months",
},
{
"subject_id": 101,
"time": datetime(2024, 1, 8),
"code": "LOINC/6690-2",
"numeric_value": 12.5,
"text_value": np.nan,
},
{
"subject_id": 101,
"time": datetime(2024, 1, 8),
"code": "CPT/71250",
"numeric_value": np.nan,
"text_value": "Nodule found in right lung",
},
{
"subject_id": 101,
"time": datetime(2024, 1, 8),
"code": "CPT/32408",
"numeric_value": np.nan,
"text_value": np.nan,
},
{
"subject_id": 101,
"time": datetime(2024, 1, 8),
"code": "ICD10CM/C34.90",
"numeric_value": np.nan,
"text_value": "Primary Diagnosis",
},
{
"subject_id": 101,
"time": datetime(2024, 1, 8),
"code": "DISCHARGE/Outpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
# Visit 2 (Week 4, 2024): Treatment
{
"subject_id": 101,
"time": datetime(2024, 1, 22),
"code": "ADMISSION/Inpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
{
"subject_id": 101,
"time": datetime(2024, 1, 22),
"code": "LOINC/59261-8",
"numeric_value": np.nan,
"text_value": "All values within normal limits",
},
{
"subject_id": 101,
"time": datetime(2024, 1, 22),
"code": "RX/Cisplatin",
"numeric_value": np.nan,
"text_value": "Cisplatin",
},
{
"subject_id": 101,
"time": datetime(2024, 1, 22),
"code": "DISCHARGE/Inpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
# Visit 3 (Week 8, 2024): Follow-up
{
"subject_id": 101,
"time": datetime(2024, 2, 19),
"code": "ADMISSION/Outpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
{
"subject_id": 101,
"time": datetime(2024, 2, 19),
"code": "NOTE/FollowUp",
"numeric_value": np.nan,
"text_value": "Patient tolerated first cycle well.",
},
{
"subject_id": 101,
"time": datetime(2024, 2, 19),
"code": "LOINC/2039-6",
"numeric_value": 50.2,
"text_value": np.nan,
},
{
"subject_id": 101,
"time": datetime(2024, 2, 19),
"code": "DISCHARGE/Outpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
# Patient 202: John Smith (Prostate Cancer) - Assigned to 'held_out' split
# Static data
{
"subject_id": 202,
"time": pd.NaT,
"code": "GENDER/Male",
"numeric_value": np.nan,
"text_value": "Male",
},
# Visit 1 (Week 10, 2024): Diagnosis
{
"subject_id": 202,
"time": datetime(2024, 3, 4),
"code": "ADMISSION/Outpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
{
"subject_id": 202,
"time": datetime(2024, 3, 4),
"code": "LOINC/2857-1",
"numeric_value": 15.1,
"text_value": np.nan,
},
{
"subject_id": 202,
"time": datetime(2024, 3, 4),
"code": "CPT/55700",
"numeric_value": np.nan,
"text_value": "Biopsy taken",
},
{
"subject_id": 202,
"time": datetime(2024, 3, 4),
"code": "ICD10CM/C61",
"numeric_value": np.nan,
"text_value": "Primary Diagnosis",
},
{
"subject_id": 202,
"time": datetime(2024, 3, 4),
"code": "DISCHARGE/Outpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
# Visit 2 (Week 14, 2024): Treatment (Surgery)
{
"subject_id": 202,
"time": datetime(2024, 4, 1),
"code": "ADMISSION/Inpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
{
"subject_id": 202,
"time": datetime(2024, 4, 1),
"code": "CPT/55840",
"numeric_value": np.nan,
"text_value": "Surgical procedure completed.",
},
{
"subject_id": 202,
"time": datetime(2024, 4, 1),
"code": "LOINC/6690-2",
"numeric_value": 8.2,
"text_value": np.nan,
},
{
"subject_id": 202,
"time": datetime(2024, 4, 1),
"code": "DISCHARGE/Inpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
# Visit 3 (Week 20, 2024): Follow-up
{
"subject_id": 202,
"time": datetime(2024, 5, 13),
"code": "ADMISSION/Outpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
{
"subject_id": 202,
"time": datetime(2024, 5, 13),
"code": "LOINC/2857-1",
"numeric_value": 0.1,
"text_value": np.nan,
},
{
"subject_id": 202,
"time": datetime(2024, 5, 13),
"code": "NOTE/FollowUp",
"numeric_value": np.nan,
"text_value": "PSA levels are undetectable post-op.",
},
{
"subject_id": 202,
"time": datetime(2024, 5, 13),
"code": "DISCHARGE/Outpatient",
"numeric_value": np.nan,
"text_value": np.nan,
},
{
"subject_id": 202,
"time": datetime(2025, 5, 13),
"code": "DEATH",
"numeric_value": np.nan,
"text_value": np.nan,
},
]
patient_events_df = pd.DataFrame(patient_events_list)
patient_events_df["time"] = pd.to_datetime(patient_events_df["time"])
patient_events_df["subject_id"] = patient_events_df["subject_id"].astype(str)
# Subject Splits DataFrame
# One row per subject, naming the data split that subject belongs to.
subject_splits_list = [
    {"subject_id": 101, "split": "train"},
    {
        "subject_id": 202,
        "split": "held_out",
    },  # 'held_out' is often used for the final test set
]
subject_splits_df = pd.DataFrame(subject_splits_list)
# patient_events_df["subject_id"] is cast to str and patients are looked up by
# string id ("101") later on, so cast here too: the id column then has the same
# dtype in both tables and an int-vs-str key mismatch cannot occur when they
# are joined.
subject_splits_df["subject_id"] = subject_splits_df["subject_id"].astype(str)
In [ ]:
Copied!
# Notebook cell: the bare expression shows the events table as the cell output
# (it has no effect when run as a plain script).
patient_events_df
patient_events_df
Conversion to TwinWeaver format¶
In [ ]:
Copied!
# Demo mapping from code to the value used in the event_category column.
# It is optional - a default is used when no mapping is provided - but it is
# especially useful when generating custom training data for LLMs.
demo_mapping = dict(
    [
        ("SYMPTOM/Cough", "symptom"),
        ("ICD10CM/C34.90", "diagnosis"),
        ("DEATH", "death"),
        ("RX/Cisplatin", "lot"),
    ]
)
# Demo mapping from code to the value used in the event_category column.
# It is optional - a default is used when no mapping is provided - but it is
# especially useful when generating custom training data for LLMs.
demo_mapping = dict(
    [
        ("SYMPTOM/Cough", "symptom"),
        ("ICD10CM/C34.90", "diagnosis"),
        ("DEATH", "death"),
        ("RX/Cisplatin", "lot"),
    ]
)
In [ ]:
Copied!
#: Do actual conversion
# Gather the inputs first: the three MEDS tables plus the conversion options.
conversion_kwargs = {
    "df_codes": code_metadata_df,
    "df_data": patient_events_df,
    "df_split": subject_splits_df,
    "prefer_text_value_over_numeric": True,
    "event_category_mapping": demo_mapping,
    "no_value_default": "observed",
}
# The result unpacks into three objects, in this order: constant,
# constant description, events.
conversion_result = convert_meds_to_dtc(**conversion_kwargs)
(
    df_converted_constant,
    df_converted_constant_description,
    df_converted_events,
) = conversion_result
#: Do actual conversion
# Gather the inputs first: the three MEDS tables plus the conversion options.
conversion_kwargs = {
    "df_codes": code_metadata_df,
    "df_data": patient_events_df,
    "df_split": subject_splits_df,
    "prefer_text_value_over_numeric": True,
    "event_category_mapping": demo_mapping,
    "no_value_default": "observed",
}
# The result unpacks into three objects, in this order: constant,
# constant description, events.
conversion_result = convert_meds_to_dtc(**conversion_kwargs)
(
    df_converted_constant,
    df_converted_constant_description,
    df_converted_events,
) = conversion_result
In [ ]:
Copied!
# Notebook cell: the bare expression shows the converted events as the cell
# output (it has no effect when run as a plain script).
df_converted_events
df_converted_events
In [ ]:
Copied!
# Keep for later: every column of the converted constant table except the
# "patientid" identifier column.
constant_columns = [
    column for column in df_converted_constant.columns.tolist() if column != "patientid"
]
# Keep for later: every column of the converted constant table except the
# "patientid" identifier column.
constant_columns = [
    column for column in df_converted_constant.columns.tolist() if column != "patientid"
]
Example usage in twinweaver package¶
Here we show an example for inference (i.e., using a pretrained model); see the other examples if you need to, e.g., generate training data.
In [ ]:
Copied!
# Set basics
# Label for this demo; it is only assigned here and not referenced again in the visible cells.
indication = "meds_demo"
config = Config() # Override values here to customize pipeline
# Use the constant columns collected earlier (all converted constant columns except "patientid").
config.constant_columns_to_use = constant_columns
config.constant_birthdate_column = None # Not using in demo
# NOTE(review): presumably None means no specific value marks the start of a line of therapy - confirm against the Config documentation.
config.event_value_lot_start = None
# "lot" is the category that demo_mapping assigns to RX/Cisplatin.
# NOTE(review): presumably this selects the event category used to pick split points - confirm.
config.split_event_category = "lot"
# NOTE(review): looks like a mapping from event category to the name used for it in event prediction; only "death" is set here - confirm.
config.event_category_events_prediction_with_naming = {
"death": "death",
}
# Set basics
# Label for this demo; it is only assigned here and not referenced again in the visible cells.
indication = "meds_demo"
config = Config() # Override values here to customize pipeline
# Use the constant columns collected earlier (all converted constant columns except "patientid").
config.constant_columns_to_use = constant_columns
config.constant_birthdate_column = None # Not using in demo
# NOTE(review): presumably None means no specific value marks the start of a line of therapy - confirm against the Config documentation.
config.event_value_lot_start = None
# "lot" is the category that demo_mapping assigns to RX/Cisplatin.
# NOTE(review): presumably this selects the event category used to pick split points - confirm.
config.split_event_category = "lot"
# NOTE(review): looks like a mapping from event category to the name used for it in event prediction; only "death" is set here - confirm.
config.event_category_events_prediction_with_naming = {
"death": "death",
}
In [ ]:
Copied!
# Setup basics
# Create the data manager with the config above and pass it the three converted tables.
dm = DataManager(config=config)
dm.load_indication_data(
df_events=df_converted_events,
df_constant=df_converted_constant,
df_constant_description=df_converted_constant_description,
)
# NOTE(review): these calls mutate dm and look order-dependent (load, process, mapping, hold-out) - keep the sequence as is.
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
# Requests validation and test fractions of 0.1 each.
# NOTE(review): the demo has only two patients - check how the hold-out sets are resolved in that case.
dm.setup_hold_out_sets(validation_split=0.1, test_split=0.1)
# Event splitter built on dm; the sampling length is bounded between 1 and 104 weeks.
data_splitter_events = DataSplitterEvents(
dm,
config=config,
max_length_to_sample=pd.Timedelta(weeks=104),
min_length_to_sample=pd.Timedelta(weeks=1),
)
data_splitter_events.setup_variables()
# Instruction converter with a total token budget of 8192; used below to build the inference prompt.
converter = ConverterInstruction(
nr_tokens_budget_total=8192,
config=config,
dm=dm,
)
# Setup basics
# Create the data manager with the config above and pass it the three converted tables.
dm = DataManager(config=config)
dm.load_indication_data(
df_events=df_converted_events,
df_constant=df_converted_constant,
df_constant_description=df_converted_constant_description,
)
# NOTE(review): these calls mutate dm and look order-dependent (load, process, mapping, hold-out) - keep the sequence as is.
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
# Requests validation and test fractions of 0.1 each.
# NOTE(review): the demo has only two patients - check how the hold-out sets are resolved in that case.
dm.setup_hold_out_sets(validation_split=0.1, test_split=0.1)
# Event splitter built on dm; the sampling length is bounded between 1 and 104 weeks.
data_splitter_events = DataSplitterEvents(
dm,
config=config,
max_length_to_sample=pd.Timedelta(weeks=104),
min_length_to_sample=pd.Timedelta(weeks=1),
)
data_splitter_events.setup_variables()
# Instruction converter with a total token budget of 8192; used below to build the inference prompt.
converter = ConverterInstruction(
nr_tokens_budget_total=8192,
config=config,
dm=dm,
)
In [ ]:
Copied!
# Patient to run the inference example for
patientid = "101"

# Fetch that patient's data and put the events in chronological order
patient_data = dm.get_patient_data(patientid)
patient_data["events"] = patient_data["events"].sort_values("date")

# Split at the date of the last event after sorting
event_dates = patient_data["events"]["date"]
split_date = event_dates.iloc[-1]

# Generate splits to predict whether death will occur in the next 52 weeks
observation_window = pd.Timedelta(weeks=52)
events_splits = data_splitter_events.get_splits_from_patient(
    patient_data,
    max_nr_samples_per_split=1,
    override_split_dates=[split_date],
    override_category="death",
    override_observation_time_delta=observation_window,
)
events_split = events_splits[0][0]

#: no forecasting split in this example, only the event-prediction task
forecast_split = None
forecasting_times_to_predict = None

# Convert to an instruction and print the resulting prompt text
converted = converter.forward_conversion_inference(
    forecasting_split=forecast_split,
    forecasting_future_weeks_per_variable=forecasting_times_to_predict,
    event_split=events_split,
    custom_tasks=None,
)
instruction_text = converted["instruction"]
print(instruction_text)
# Patient to run the inference example for
patientid = "101"

# Fetch that patient's data and put the events in chronological order
patient_data = dm.get_patient_data(patientid)
patient_data["events"] = patient_data["events"].sort_values("date")

# Split at the date of the last event after sorting
event_dates = patient_data["events"]["date"]
split_date = event_dates.iloc[-1]

# Generate splits to predict whether death will occur in the next 52 weeks
observation_window = pd.Timedelta(weeks=52)
events_splits = data_splitter_events.get_splits_from_patient(
    patient_data,
    max_nr_samples_per_split=1,
    override_split_dates=[split_date],
    override_category="death",
    override_observation_time_delta=observation_window,
)
events_split = events_splits[0][0]

#: no forecasting split in this example, only the event-prediction task
forecast_split = None
forecasting_times_to_predict = None

# Convert to an instruction and print the resulting prompt text
converted = converter.forward_conversion_inference(
    forecasting_split=forecast_split,
    forecasting_future_weeks_per_variable=forecasting_times_to_predict,
    event_split=events_split,
    custom_tasks=None,
)
instruction_text = converted["instruction"]
print(instruction_text)