Trajectoires de soins

Première description des données
Analyse de survie classique
Analyse de séquences sur l’ensemble du fichier
Analyse de séquences sur 18 mois
Facteurs associés à chaque groupe

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.3.9000     v purrr   0.3.4     
## v tibble  3.1.2          v dplyr   1.0.6     
## v tidyr   1.1.3          v stringr 1.4.0     
## v readr   1.4.0          v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(labelled)
library(questionr)
library(gtsummary)
library(viridis)

## Le chargement a nécessité le package : viridisLite

library(scales)

## 
## Attachement du package : 'scales'

## L'objet suivant est masqué depuis 'package:viridis':
## 
##     viridis_pal

## L'objet suivant est masqué depuis 'package:purrr':
## 
##     discard

## L'objet suivant est masqué depuis 'package:readr':
## 
##     col_factor

load("care_trajectories.RData")
care_trajectories <- as_tibble(care_trajectories)
var_label(care_trajectories$sex) <- "Sexe"
val_labels(care_trajectories$sex) <- c(homme = 0, femme = 1)
var_label(care_trajectories$age) <- "Âge"
var_label(care_trajectories$education) <- "Education"
val_labels(care_trajectories$education) <- c(
  primaire = 1,
  secondaire = 2,
  supérieur = 3
)
val_labels(care_trajectories$care_status) <- c(
  "diagnostiqué, mais pas suivi" = "D",
  "suivi, mais pas sous traitement" = "C",
  "sous traitement, mais infection non contrôlée" = "T",
  "sous traitement et infection contrôlée" = "S"
)
care_trajectories <- care_trajectories %>%
  set_variable_labels(
    id = "Identifiant Patient",
    month = "Mois depuis la diagnostic",
    care_status = "Statut dans les soins",
    wealth = "Niveau de richesse",
    distance_clinic = "Distance à la clinique la plus proche"
  ) %>%
  set_value_labels(
    wealth = c(bas = 1, moyen = 2, haut = 3),
    distance_clinic = c("moins de 10 km" = 1, "10 km ou plus" = 2)
  )

Première description des données

care_trajectories %>%
  unlabelled() %>%
  tbl_summary()

Characteristic	N = 49,365¹
Identifiant Patient	4,961 (2,481, 7,532)
Mois depuis la diagnostic	9 (4, 16)
Statut dans les soins
diagnostiqué, mais pas suivi	25,374 (51%)
suivi, mais pas sous traitement	5,886 (12%)
sous traitement, mais infection non contrôlée	4,596 (9.3%)
sous traitement et infection contrôlée	13,509 (27%)
Sexe
homme	17,781 (36%)
femme	31,584 (64%)
Âge
16-29	16,911 (34%)
30-59	29,365 (59%)
60+	3,089 (6.3%)
Education
primaire	10,417 (21%)
secondaire	19,024 (39%)
supérieur	19,924 (40%)
Niveau de richesse
bas	15,432 (31%)
moyen	20,769 (42%)
haut	13,164 (27%)
Distance à la clinique la plus proche
moins de 10 km	26,804 (54%)
10 km ou plus	22,561 (46%)
¹ Median (IQR); n (%)

Le fichier contient en tout 49365 lignes (une par individu et par mois de suivi). Il correspond à 2929 individus différents.

n <- care_trajectories %>%
  filter(month %in% (0:8*6)) %>%
  group_by(month) %>%
  count() %>%
  pluck("n")

etiquettes <- paste0("M", 0:8*6, "\n(n=", n, ")")

ggplot(care_trajectories) +
  aes(x = month, fill = to_factor(care_status)) +
  geom_bar(width = 1, color = "gray50") +
  scale_fill_viridis(discrete = TRUE, direction = -1) +
  xlab("") + ylab("") +
  scale_x_continuous(breaks = 0:8*6, labels = etiquettes) +
  labs(fill = "Statut dans les soins") +
  ggtitle("Distribution du statut dans les soins chaque mois") +
  guides(fill = guide_legend(nrow = 2)) +
  theme_light() +
  theme(legend.position = "bottom")

ggplot(care_trajectories %>% filter(month <= 36)) +
  aes(x = month, fill = to_factor(care_status)) +
  geom_bar(width = 1, color = "gray50", position = "fill") +
  scale_fill_viridis(discrete = TRUE, direction = -1) +
  xlab("") + ylab("") +
  scale_x_continuous(breaks = 0:8*6, labels = etiquettes) +
  scale_y_continuous(labels = percent, breaks = 0:5/5) +
  labs(fill = "Statut dans les soins") +
  ggtitle("Cascade des soins observée, selon le temps depuis le diagnostic") +
  guides(fill = guide_legend(nrow = 2)) +
  theme_light() +
  theme(legend.position = "bottom")

Analyse de survie classique

ind <- care_trajectories %>% filter(month == 0)
ind$diagnostic <- 0

ind <- ind %>%
  left_join(
    care_trajectories %>%
      filter(care_status %in% c("C", "T", "S")) %>%
      group_by(id) %>%
      summarise(entree_soins = min(month)),
    by = "id"
  ) %>%
  left_join(
    care_trajectories %>%
      filter(care_status %in% c("T", "S")) %>%
      group_by(id) %>%
      summarise(initiation_tt = min(month)),
    by = "id"
  ) %>%
  left_join(
    care_trajectories %>%
      filter(care_status == "S") %>%
      group_by(id) %>%
      summarise(controle = min(month)),
    by = "id"
  ) %>%
  left_join(
    care_trajectories %>%
      group_by(id) %>%
      summarise(suivi = max(month)),
    by = "id"
  )

Un premier exemple : temps entre l’entrée en soins et l’initiation d’un traitement

library(survival)
library(broom)

tmp <- ind %>% filter(!is.na(entree_soins))

tmp$event <- FALSE
tmp$time <- tmp$suivi - tmp$entree_soins

tmp$event <- !is.na(tmp$initiation_tt)
tmp$time[tmp$event] <- tmp$initiation_tt[tmp$event] - tmp$entree_soins[tmp$event]


kaplan <- survfit(Surv(time, event) ~ 1 , data = tmp)
res <- tidy(kaplan, conf.int = TRUE)


ggplot(res) +
  aes(x = time, y = estimate) +
  geom_line()

Temps depuis le diagnostic

km <- function(date_origine, date_evenement, nom = ""){
  library(survival)
  library(broom)
  
  tmp <- ind 
  tmp <- tmp %>% filter(!is.na(tmp[[date_origine]]))
  
  tmp$event <- FALSE
  tmp$time <- tmp$suivi - tmp[[date_origine]]
  
  tmp$event <- !is.na(tmp[[date_evenement]])
  tmp$time[tmp$event] <- tmp[[date_evenement]][tmp$event] - tmp[[date_origine]][tmp$event]
  
  
  kaplan <- survfit(Surv(time, event) ~ 1 , data = tmp)
  res <- tidy(kaplan, conf.int = TRUE)
  res$nom <- nom
  
  res
}

depuis_diag <- bind_rows(
  km("diagnostic", "entree_soins", "Entrée en soins"),
  km("diagnostic", "initiation_tt", "Initiation du traitement"),
  km("diagnostic", "controle", "Contrôle de l'infection")
)
depuis_diag$nom <- fct_inorder(depuis_diag$nom)

ggplot(depuis_diag) +
  aes(
    x = time, y = 1 - estimate, colour = nom,
    ymin = 1 - conf.high, ymax = 1 - conf.low, fill = nom
  ) +
  geom_ribbon(alpha = .25, mapping = aes(colour = NULL)) +
  geom_line() +
  scale_x_continuous(limits = c(0, 36), breaks = 0:6*6) +
  scale_y_continuous(labels = scales::percent) +
  xlab("mois depuis le diagnostic") +
  ylab("") + labs(color = "", fill = "") +
  theme_classic() +
  theme(
    legend.position = "bottom",
    panel.grid.major.y = element_line(colour = "grey")
  )

## Warning: Removed 31 row(s) containing missing values (geom_path).

depuis_prec <- bind_rows(
  km("diagnostic", "entree_soins", "Entrée en soins"),
  km("entree_soins", "initiation_tt", "Initiation du traitement"),
  km("initiation_tt", "controle", "Contrôle de l'infection")
)
depuis_prec$nom <- fct_inorder(depuis_prec$nom)

ggplot(depuis_prec) +
  aes(
    x = time, y = 1 - estimate, colour = nom,
    ymin = 1 - conf.high, ymax = 1 - conf.low, fill = nom
  ) +
  geom_ribbon(alpha = .25, mapping = aes(colour = NULL)) +
  geom_line() +
  scale_x_continuous(limits = c(0, 36), breaks = 0:6*6) +
  scale_y_continuous(labels = scales::percent) +
  xlab("mois depuis l'étape précédente") +
  ylab("") + labs(color = "", fill = "") +
  theme_classic() +
  theme(
    legend.position = "bottom",
    panel.grid.major.y = element_line(colour = "grey")
  ) +
  facet_grid(cols = vars(nom))

## Warning: Removed 13 row(s) containing missing values (geom_path).

Selon le sexe

km_sexe <- function(date_origine, date_evenement, nom = ""){
  library(survival)
  library(broom)
  
  tmp <- ind 
  tmp <- tmp %>% filter(!is.na(tmp[[date_origine]]))
  
  tmp$event <- FALSE
  tmp$time <- tmp$suivi - tmp[[date_origine]]
  
  tmp$event <- !is.na(tmp[[date_evenement]])
  tmp$time[tmp$event] <- tmp[[date_evenement]][tmp$event] - tmp[[date_origine]][tmp$event]
  
  tmp$sexe <- to_factor(tmp$sex)
  
  
  kaplan <- survfit(Surv(time, event) ~ sexe , data = tmp)
  res <- tidy(kaplan, conf.int = TRUE)
  res$nom <- nom
  
  res
}

depuis_prec_sexe <- bind_rows(
  km_sexe("diagnostic", "entree_soins", "Entrée en soins"),
  km_sexe("entree_soins", "initiation_tt", "Initiation du traitement"),
  km_sexe("initiation_tt", "controle", "Contrôle de l'infection")
)
depuis_prec_sexe$nom <- fct_inorder(depuis_prec_sexe$nom)
depuis_prec_sexe$sexe <- depuis_prec_sexe$strata %>%
  str_sub(start = 6)

ggplot(depuis_prec_sexe) +
  aes(
    x = time, y = 1 - estimate, colour = sexe,
    ymin = 1 - conf.high, ymax = 1 - conf.low, fill = sexe
  ) +
  geom_ribbon(alpha = .25, mapping = aes(colour = NULL)) +
  geom_line() +
  scale_x_continuous(limits = c(0, 36), breaks = 0:6*6) +
  scale_y_continuous(labels = scales::percent) +
  xlab("mois depuis l'étape précédente") +
  ylab("") + labs(color = "", fill = "") +
  theme_classic() +
  theme(
    legend.position = "bottom",
    panel.grid.major.y = element_line(colour = "grey")
  ) +
  facet_grid(cols = vars(nom))

Analyse de séquences sur l’ensemble du fichier

large <- care_trajectories %>%
  select(id, month, care_status) %>%
  pivot_wider(names_from = month, values_from = care_status, names_prefix = "m")

library(TraMineR, quietly = TRUE)

## 
## TraMineR stable version 2.2-1 (Built: 2020-11-02)

## Website: http://traminer.unige.ch

## Please type 'citation("TraMineR")' for citation information.

seq_all <- seqdef(
  large %>% select(-id),
  id = large$id,
  alphabet = c("D", "C", "T", "S"),
  states = c("diagnostiqué", "en soins", "sous traitement", "controlé"),
  cpal = viridis(4, direction = -1)
)

##  [>] found missing values ('NA') in sequence data

##  [>] preparing 2929 sequences

##  [>] coding void elements with '%' and missing values with '*'

##  [>] state coding:

##        [alphabet]  [label]         [long label]

##      1  D           diagnostiqué    diagnostiqué

##      2  C           en soins        en soins

##      3  T           sous traitement sous traitement

##      4  S           controlé        controlé

##  [>] 2929 sequences in the data set

##  [>] min/max sequence length: 1/51

seqdplot(seq_all, legend.prop = .25)

couts <- seqcost(seq_all, method = "CONSTANT")

##  [>] creating 4x4 substitution-cost matrix using 2 as constant value

couts$sm[1,] <- c(0, 1, 2, 3)
couts$sm[2,] <- c(1, 0, 1, 2)
couts$sm[3,] <- c(2, 1, 0, 1)
couts$sm[4,] <- c(3, 2, 1, 0)
couts$indel <- max(couts$sm)/2

couts

## $indel
## [1] 1.5
## 
## $sm
##                   diagnostiqué-> en soins-> sous traitement-> controlé->
## diagnostiqué->                 0          1                 2          3
## en soins->                     1          0                 1          2
## sous traitement->              2          1                 0          1
## controlé->                     3          2                 1          0

dist_all <- seqdist(seq_all, method = "OM", sm = couts$sm, indel = couts$indel)

##  [>] 2929 sequences with 4 distinct states

##  [>] checking 'sm' (size and triangle inequality)

##  [>] 1370 distinct  sequences

##  [>] min/max sequence lengths: 1/51

##  [>] computing distances using the OM metric

##  [>] elapsed time: 2.07 secs

arbre_all <- hclust(as.dist(dist_all), method = "ward.D2")
plot(arbre_all)

library(seqhandbook)
seq_heatmap(seq_all, arbre_all)

Analyse de séquences sur 18 mois

large$seq_length <- seqlength(seq_all)
large_m18 <- large %>%
  filter(seq_length >= 19) %>%
  select(id, m0:m18)

seq_m18 <- seqdef(
  large_m18 %>% select(-id),
  id = large_m18$id,
  alphabet = c("D", "C", "T", "S"),
  states = c("diagnostiqué", "en soins", "sous traitement", "controlé"),
  cpal = viridis(4, direction = -1)
)

##  [>] state coding:

##        [alphabet]  [label]         [long label]

##      1  D           diagnostiqué    diagnostiqué

##      2  C           en soins        en soins

##      3  T           sous traitement sous traitement

##      4  S           controlé        controlé

##  [>] 1317 sequences in the data set

##  [>] min/max sequence length: 19/19

seqdplot(seq_m18, legend.prop = .25)

dist_m18 <- seqdist(seq_m18, method = "OM", sm = couts$sm, indel = couts$indel)

##  [>] 1317 sequences with 4 distinct states

##  [>] checking 'sm' (size and triangle inequality)

##  [>] 578 distinct  sequences

##  [>] min/max sequence lengths: 19/19

##  [>] computing distances using the OM metric

##  [>] elapsed time: 0.31 secs

arbre_m18 <- hclust(as.dist(dist_m18), method = "ward.D2")
plot(arbre_m18)

seq_heatmap(seq_m18, arbre_m18)

library(WeightedCluster, quietly = TRUE)

## This is WeightedCluster stable version 1.4-1 (Built: 2020-07-07)

## 
## To get the manuals, please run:

##    vignette("WeightedCluster") ## Complete manual in English

##    vignette("WeightedClusterFR") ## Complete manual in French

##    vignette("WeightedClusterPreview") ## Short preview in English

## 
## To cite WeightedCluster in publications please use:

## Studer, Matthias (2013). WeightedCluster Library Manual: A practical

##    guide to creating typologies of trajectories in the social sciences

##    with R. LIVES Working Papers, 24. doi:

##    10.12682/lives.2296-1658.2013.24

seqtree_m18 <- as.seqtree(arbre_m18, seqdata = seq_m18, diss = dist_m18, ncluster = 7)
seqtreedisplay(seqtree_m18, type = "I", filename = "seqtree_m18.png", show.depth = TRUE)

##  [>] GraphViz version 2.38 found.

mon seqtree

large_m18$typo_cah <- cutree(arbre_m18, k = 4)

pam_m18 <- wcKMedoids(dist_m18, k = 4, initialclust = arbre_m18)
large_m18$typo_pam <- pam_m18$clustering

library(gtsummary, quietly = TRUE)
large_m18 %>%
  tbl_cross(row = typo_cah, col = typo_pam)

Characteristic	typo_pam				Total
Characteristic	6	23	85	410	Total
typo_cah
1	522	4	0	39	565
2	0	3	330	3	336
3	1	203	59	13	276
4	23	0	5	112	140
Total	546	210	394	167	1,317

large_m18$ordre_cmd <- cmdscale(as.dist(dist_m18), k = 1)

seqIplot(seq_m18, group = large_m18$typo_cah)

seqIplot(seq_m18, group = large_m18$typo_cah, sortv = large_m18$ordre_cmd)

seqIplot(seq_m18, group = large_m18$typo_pam, sortv = large_m18$ordre_cmd)

tab <- tibble(
  indicateur = names(wcClusterQuality(dist_m18, large_m18$typo_cah)$stats),
  cah = wcClusterQuality(dist_m18, large_m18$typo_cah)$stats,
  pam = wcClusterQuality(dist_m18, large_m18$typo_pam)$stats
)
tab %>% gt::gt() %>% gt::fmt_number(2:3, decimals = 2, sep_mark = " ", dec_mark = ",")

indicateur	cah	pam
PBC	0,70	0,71
HG	0,93	0,95
HGSD	0,93	0,95
ASW	0,53	0,57
ASWw	0,53	0,57
CH	938,97	1 023,12
R2	0,68	0,70
CHsq	3 351,21	3 980,60
R2sq	0,88	0,90
HC	0,03	0,02

large_m18$groupe <- as.character(large_m18$typo_pam)
large_m18$groupe <- fct_recode(large_m18$groupe,
  "Hors Soins" = "6",
  "Lents" = "23",
  "Rapides" = "85",
  "Inaboutis" = "410"
)
large_m18$groupe <- fct_relevel(
  large_m18$groupe,
  "Rapides", "Lents", "Inaboutis", "Hors Soins"
)

large_m18 <- large_m18 %>%
  mutate(rang_cmd = rank(ordre_cmd, ties.method = "first"))

long_m18 <- large_m18 %>% 
  pivot_longer(m0:m18, names_to = "month", values_to = "care_status", names_prefix = "m")
  
long_m18$month <- as.integer(long_m18$month)
long_m18$care_statusF <- to_factor(long_m18$care_status)

ggplot(long_m18) +
  aes(x = month, y = factor(rang_cmd), fill = care_statusF) +
  geom_raster() +
  scale_fill_viridis(discrete = TRUE, direction = -1) +
  facet_grid(rows = vars(groupe), scales = "free", space = "free") +
  labs(x = NULL, y = NULL, fill = NULL) +
  scale_y_discrete(labels = NULL) +
  scale_x_continuous(
    expand = expansion(0, 0), 
    breaks = 0:3*6,
    labels = paste0("M", 0:3*6)
  ) +
  guides(
    fill = guide_legend(ncol = 2)
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom"
  )

Facteurs associés à chaque groupe

large_m18 <- large_m18 %>%
  left_join(
    care_trajectories %>%
      filter(month == 0) %>%
      select(id, sex, age, education),
    by = "id"
  )

library(gtsummary)

theme_gtsummary_language("fr", decimal.mark = ",", big.mark = " ")

## Setting theme `language: fr`

large_m18 %>%
  select(groupe, age, sex, education) %>%
  unlabelled() %>%
  tbl_summary(
    by = "groupe", 
    digits = all_categorical() ~ c(0, 1)
  ) %>%
  add_overall(last = TRUE) %>%
  add_p()

Caractéristique	Rapides, N = 394¹	Lents, N = 210¹	Inaboutis, N = 167¹	Hors Soins, N = 546¹	Total, N = 1 317¹	p-valeur²
Âge						<0,001
16-29	96 (24,4%)	71 (33,8%)	62 (37,1%)	235 (43,0%)	464 (35,2%)
30-59	269 (68,3%)	128 (61,0%)	91 (54,5%)	281 (51,5%)	769 (58,4%)
60+	29 (7,4%)	11 (5,2%)	14 (8,4%)	30 (5,5%)	84 (6,4%)
Sexe						<0,001
homme	104 (26,4%)	61 (29,0%)	42 (25,1%)	243 (44,5%)	450 (34,2%)
femme	290 (73,6%)	149 (71,0%)	125 (74,9%)	303 (55,5%)	867 (65,8%)
Education						0,002
primaire	63 (16,0%)	47 (22,4%)	30 (18,0%)	123 (22,5%)	263 (20,0%)
secondaire	126 (32,0%)	83 (39,5%)	61 (36,5%)	212 (38,8%)	482 (36,6%)
supérieur	205 (52,0%)	80 (38,1%)	76 (45,5%)	211 (38,6%)	572 (43,4%)
¹ n (%) ² test du khi-deux d'indépendance

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

large_m18 %>%
  unlabelled() %>%
  ggbivariate(
    outcome = "groupe",
    explanatory = c("age", "sex", "education")
  )

large_m18 %>%
  unlabelled() %>%
  ggtable(
    columnsX = "groupe",
    columnsY = c("age", "sex", "education"),
    cells = "col.prop",
    fill = "std.resid",
    legend = 1
  )

library(nnet)
large_m18$groupe2 <- large_m18$groupe %>%
  fct_relevel("Hors Soins")
regm <- multinom(
  groupe2 ~ sex + age + education,
  data = unlabelled(large_m18)
)

## # weights:  28 (18 variable)
## initial  value 1825.749674 
## iter  10 value 1648.561757
## iter  20 value 1638.851444
## final  value 1638.811514 
## converged

regm %>%
  tbl_regression()

## i Multinomial models have a different underlying structure than the models
## gtsummary was designed for. Other gtsummary functions designed to work with
## tbl_regression objects may yield unexpected results.

Caractéristique	log(OR)¹	95% CI¹	p-valeur
Rapides
Sexe
homme	—	—
femme	0,88	0,59 – 1,2	<0,001
Âge
16-29	—	—
30-59	0,82	0,50 – 1,1	<0,001
60+	0,79	0,16 – 1,4	0,013
Education
primaire	—	—
secondaire	0,18	-0,21 – 0,56	0,4
supérieur	0,35	-0,04 – 0,75	0,080
Lents
Sexe
homme	—	—
femme	0,72	0,37 – 1,1	<0,001
Âge
16-29	—	—
30-59	0,56	0,19 – 0,93	0,003
60+	0,48	-0,33 – 1,3	0,2
Education
primaire	—	—
secondaire	0,05	-0,38 – 0,48	0,8
supérieur	-0,20	-0,66 – 0,26	0,4
Inaboutis
Sexe
homme	—	—
femme	0,91	0,52 – 1,3	<0,001
Âge
16-29	—	—
30-59	0,20	-0,21 – 0,61	0,3
60+	0,55	-0,22 – 1,3	0,2
Education
primaire	—	—
secondaire	0,20	-0,30 – 0,69	0,4
supérieur	0,30	-0,23 – 0,82	0,3
¹ OR = rapport de cotes, CI = intervalle de confiance

regm %>%
  ggcoef_multinom(exponentiate = TRUE)

regm %>%
  ggcoef_multinom(exponentiate = TRUE, type = "f")

library(ggeffects)
ggeffect(regm) %>%
  plot() %>%
  cowplot::plot_grid(plotlist = ., ncol = 3)

Trajectoires de soins

Joseph Larmarange

17/06/2021

Première description des données

Analyse de survie classique

Un premier exemple : temps entre l’entrée en soins et l’initiation d’un traitement

Temps depuis le diagnostic

Selon le sexe

Analyse de séquences sur l’ensemble du fichier

Analyse de séquences sur 18 mois

Facteurs associés à chaque groupe