---
title: "ICD10 Atlas Generation"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{ICD10 Atlas Generation}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

## Goal

This vignette documents the code used to generate an ICD10 three-digit
`CohortContrast` atlas across multiple observation windows.

The resulting atlas based on EstHealth30 data can be explored at
<http://omop-apps.cloud.ut.ee/CohortContrastAtlas/>.

The workflow runs the same analysis for all eligible three-digit ICD10 codes
and produces summary-mode study folders for three observation periods:

- diagnosis day `0` to `365`
- diagnosis day `0` to `30`
- diagnosis day `-90` to `0`

This pipeline keeps only the final summary folders.

## Required environment variables

```{r, eval = FALSE}
# Database connection settings expected by the helper below.
Sys.getenv("DB_NAME")
Sys.getenv("DB_HOST")
Sys.getenv("DB_PORT")
Sys.getenv("DB_USERNAME")
Sys.getenv("DB_PASSWORD")

# OMOP schema settings.
Sys.getenv("OHDSI_CDM")
Sys.getenv("OHDSI_RESULTS")
Sys.getenv("OHDSI_WRITE")
```

## Connect to the OMOP database

```{r, eval = FALSE}
library(CohortContrast)
library(CDMConnector)
library(DBI)
library(RPostgres)
library(dplyr)
library(purrr)
library(tibble)
library(icd.data)

connectAtlasCdm <- function() {
  # Open the database connection from environment variables instead of
  # hard-coded credentials.
  db <- DBI::dbConnect(
    RPostgres::Postgres(),
    dbname = Sys.getenv("DB_NAME"),
    host = Sys.getenv("DB_HOST"),
    user = Sys.getenv("DB_USERNAME"),
    password = Sys.getenv("DB_PASSWORD"),
    port = as.integer(Sys.getenv("DB_PORT"))
  )

  # Create the CDMConnector object used by CohortContrast.
  cdm <- CDMConnector::cdmFromCon(
    con = db,
    cdmSchema = Sys.getenv("OHDSI_CDM"),
    achillesSchema = Sys.getenv("OHDSI_RESULTS"),
    writeSchema = c(
      schema = Sys.getenv("OHDSI_WRITE"),
      prefix = "cc_"
    )
  )

  list(db = db, cdm = cdm)
}
```

## Define the observation windows

```{r, eval = FALSE}
# Each row defines one observation window relative to the first qualifying
# diagnosis date and one matching control-window lag.
windows <- tibble::tribble(
  ~window_name,   ~target_start_offset, ~target_end_offset, ~control_lag_days, ~minimum_observation_days,
  "after_1year",                   0L,                365L,             1095L,                     1095L,
  "after_30days",                  0L,                 30L,              1095L,                      1095L,
  "prior_90days",                -90L,                  0L,              1095L,                      1095L
)
```

## Identify eligible ICD10 three-digit codes

```{r, eval = FALSE}
getEligibleIcd10Codes <- function(cdm, minPatients = 100L) {
  # Keep only three-digit ICD10 codes from the icd.data lookup table.
  icd10Lookup <- icd.data::icd10cm2016 %>%
    dplyr::filter(nchar(.data$code) < 4) %>%
    dplyr::distinct(.data$code, .data$long_desc)

  # Count patients per three-digit source code in the OMOP condition table.
  counts <- cdm$condition_occurrence %>%
    dplyr::mutate(icd10code = substr(.data$condition_source_value, 1, 3)) %>%
    dplyr::group_by(.data$icd10code) %>%
    dplyr::summarise(n = dplyr::n_distinct(.data$person_id), .groups = "drop") %>%
    dplyr::collect() %>%
    dplyr::filter(.data$n >= minPatients)

  dplyr::inner_join(
    counts,
    icd10Lookup,
    by = c("icd10code" = "code")
  )
}
```

## Helper for clean study names

```{r, eval = FALSE}
makeSafeStudyName <- function(code, label, windowName) {
  raw <- paste(code, label, windowName, sep = "_")
  raw <- tolower(raw)
  raw <- gsub("[^a-z0-9]+", "_", raw)
  raw <- gsub("^_+|_+$", "", raw)
  raw
}
```

## Build the target cohort

```{r, eval = FALSE}
buildIcd10TargetTable <- function(cdm, icd10code, targetStartOffset, targetEndOffset,
                                  minimumObservationDays) {
  targetStartOffset <- as.integer(targetStartOffset)
  targetEndOffset <- as.integer(targetEndOffset)
  minimumObservationDays <- as.integer(minimumObservationDays)

  # Restrict to the selected ICD10 three-digit source code.
  conditionData <- cdm$condition_occurrence %>%
    dplyr::filter(substr(.data$condition_source_value, 1, 3) == icd10code) %>%
    dplyr::collect()

  # Keep the first qualifying diagnosis event per person as the baseline anchor.
  firstDiagnosis <- conditionData %>%
    dplyr::group_by(.data$person_id) %>%
    dplyr::slice_min(order_by = .data$condition_start_date, n = 1, with_ties = FALSE) %>%
    dplyr::ungroup() %>%
    dplyr::transmute(
      person_id = .data$person_id,
      anchor_date = .data$condition_start_date
    )

  # Ensure adequate observation time before the anchor date.
  observationPeriod <- cdm$observation_period %>%
    dplyr::collect() %>%
    dplyr::select(
      person_id,
      observation_period_start_date,
      observation_period_end_date
    )

  eligible <- firstDiagnosis %>%
    dplyr::left_join(observationPeriod, by = "person_id") %>%
    dplyr::filter(
      as.numeric(.data$anchor_date - .data$observation_period_start_date) > minimumObservationDays
    )

  targetTable <- eligible %>%
    dplyr::transmute(
      cohort_definition_id = "target",
      subject_id = .data$person_id,
      cohort_start_date = .data$anchor_date + targetStartOffset,
      cohort_end_date = .data$anchor_date + targetEndOffset
    ) %>%
    dplyr::filter(.data$cohort_start_date < .data$cohort_end_date)

  CohortContrast::resolveCohortTableOverlaps(
    cohortTable = targetTable,
    cdm = cdm
  )
}
```

## Build the visit-based control cohort

```{r, eval = FALSE}
buildIcd10VisitControl <- function(cdm, targetTable, controlLagDays) {
  controlLagDays <- as.integer(controlLagDays)
  followupDays <- as.integer(
    unique(targetTable$cohort_end_date - targetTable$cohort_start_date)
  )

  # Shift the target windows backwards to define the control-search anchor.
  controlSeed <- targetTable %>%
    dplyr::mutate(
      cohort_start_date = cohort_start_date - controlLagDays,
      cohort_end_date = cohort_end_date - controlLagDays
    )

  cdm <- CDMConnector::insertTable(
    cdm = cdm,
    name = "icd10_control_seed",
    table = controlSeed,
    overwrite = TRUE,
    temporary = TRUE
  )

  controlTable <- cdm$visit_occurrence %>%
    dplyr::inner_join(cdm$icd10_control_seed, by = c("person_id" = "subject_id")) %>%
    dplyr::filter(visit_start_date <= cohort_start_date) %>%
    dplyr::group_by(person_id) %>%
    # To reduce bias from differential encounter density, align each patient's
    # control window to begin at the start of the clinical visit closest to the
    # baseline anchor date, operationalized here as the latest eligible visit
    # on or before the shifted anchor date.
    dplyr::slice_max(order_by = visit_start_date, n = 1, with_ties = FALSE) %>%
    dplyr::ungroup() %>%
    dplyr::transmute(
      subject_id = .data$person_id,
      cohort_start_date = .data$visit_start_date
    ) %>%
    dplyr::collect() %>%
    dplyr::mutate(
      cohort_definition_id = "control",
      cohort_end_date = .data$cohort_start_date + followupDays
    ) %>%
    dplyr::select(
      cohort_definition_id,
      subject_id,
      cohort_start_date,
      cohort_end_date
    ) %>%
    dplyr::filter(.data$cohort_start_date < .data$cohort_end_date)

  CohortContrast::resolveCohortTableOverlaps(
    cohortTable = controlTable,
    cdm = cdm
  )
}
```

## Keep only the final selected concepts

```{r, eval = FALSE}
trimFinalStudy <- function(data) {
  # selectedFeatureData stores the concepts retained after statistical
  # filtering and optional post-processing.
  finalIds <- unique(data$selectedFeatureData$selectedFeatureIds)

  data$data_features <- data$data_features %>%
    dplyr::filter(.data$CONCEPT_ID %in% finalIds)

  data$data_patients <- data$data_patients %>%
    dplyr::filter(.data$CONCEPT_ID %in% finalIds)

  data$complementaryMappingTable <- data$complementaryMappingTable %>%
    dplyr::filter(
      .data$CONCEPT_ID %in% finalIds |
        .data$NEW_CONCEPT_ID %in% finalIds
    )

  data$selectedFeatureData$selectedFeatures <- data$selectedFeatureData$selectedFeatures %>%
    dplyr::filter(.data$CONCEPT_ID %in% finalIds)
  data$selectedFeatureData$selectedFeatureIds <- finalIds
  data$selectedFeatureData$selectedFeatureNames <- unique(
    data$selectedFeatureData$selectedFeatures$CONCEPT_NAME
  )
  data$trajectoryDataList <- data$selectedFeatureData

  data
}
```

## Run one ICD10 code and one observation window

```{r, eval = FALSE}
runIcd10AtlasCase <- function(cdm, icd10code, longDesc, windowConfig,
                              summaryRoot, scratchRoot) {
  studyName <- makeSafeStudyName(
    code = icd10code,
    label = longDesc,
    windowName = windowConfig$window_name
  )

  # Build the target and control cohorts for the selected ICD10 code and
  # observation window.
  targetTable <- buildIcd10TargetTable(
    cdm = cdm,
    icd10code = icd10code,
    targetStartOffset = windowConfig$target_start_offset,
    targetEndOffset = windowConfig$target_end_offset,
    minimumObservationDays = windowConfig$minimum_observation_days
  )

  controlTable <- buildIcd10VisitControl(
    cdm = cdm,
    targetTable = targetTable,
    controlLagDays = windowConfig$control_lag_days
  )

  # Skip extremely small analyses.
  if (nrow(targetTable) < 100 || nrow(controlTable) < 5) {
    return(invisible(NULL))
  }

  # Run CohortContrast in memory first.
  result <- CohortContrast::CohortContrast(
    cdm = cdm,
    targetTable = targetTable,
    controlTable = controlTable,
    pathToResults = scratchRoot,
    domainsIncluded = c(
      "Drug",
      "Condition",
      "Measurement",
      "Observation",
      "Procedure",
      "Visit",
      "Visit detail",
      "Death"
    ),
    prevalenceCutOff = 1,
    topK = FALSE,
    getSourceData = FALSE,
    runChi2YTests = TRUE,
    runLogitTests = TRUE,
    createOutputFiles = FALSE,
    complName = studyName,
    runRemoveTemporalBias = TRUE,
    removeTemporalBiasArgs = list(
      removeIdentified = TRUE,
      ratio = 1
    ),
    runAutomaticHierarchyCombineConcepts = TRUE,
    automaticHierarchyCombineConceptsArgs = list(
      abstractionLevel = -1,
      minDepthAllowed = 0,
      allowOnlyMinors = TRUE
    ),
    runAutomaticCorrelationCombineConcepts = TRUE,
    automaticCorrelationCombineConceptsArgs = list(
      abstractionLevel = -1,
      minCorrelation = 0.7,
      maxDaysInBetween = 1,
      heritageDriftAllowed = FALSE
    )
  )

  # Keep only the final selected concepts before exporting.
  result <- trimFinalStudy(result)

  # Save a temporary patient-mode study only so that precomputeSummary() can
  # generate the final summary-mode folder.
  CohortContrast:::saveResult(result, scratchRoot)

  CohortContrast::precomputeSummary(
    studyPath = file.path(scratchRoot, studyName),
    outputPath = file.path(summaryRoot, studyName),
    clusterKValues = c(2, 3, 4, 5)
  )

  # Remove the temporary patient-mode study folder and keep only the summary.
  unlink(file.path(scratchRoot, studyName), recursive = TRUE, force = TRUE)

  invisible(NULL)
}
```

## Run the full atlas generation loop

```{r, eval = FALSE}
# Directory that will keep only the summary-mode outputs.
summaryRoot <- "icd10_summary_atlas"

# Temporary directory used only while precomputeSummary() is running.
scratchRoot <- file.path(tempdir(), "cohortcontrast_icd10_scratch")

dir.create(summaryRoot, recursive = TRUE, showWarnings = FALSE)
dir.create(scratchRoot, recursive = TRUE, showWarnings = FALSE)

conn <- connectAtlasCdm()
db <- conn$db
cdm <- conn$cdm

eligibleCodes <- getEligibleIcd10Codes(cdm = cdm, minPatients = 100L)

for (i in seq_len(nrow(eligibleCodes))) {
  icd10code <- eligibleCodes$icd10code[i]
  longDesc <- eligibleCodes$long_desc[i]

  for (j in seq_len(nrow(windows))) {
    runIcd10AtlasCase(
      cdm = cdm,
      icd10code = icd10code,
      longDesc = longDesc,
      windowConfig = windows[j, ],
      summaryRoot = summaryRoot,
      scratchRoot = scratchRoot
    )
  }
}

DBI::dbDisconnect(db)
```

## Resulting output folders

After the workflow completes, the output directory contains one summary-mode
study folder per ICD10 code and per observation window.

The final retained studies:

- respect the three configured observation periods
- use visit-aligned control windows to reduce encounter-density bias
- include only the concepts that survived the statistical and post-processing
  pipeline
- keep only the summary outputs intended for atlas-style browsing