library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3.9000     v purrr   0.3.4     
## v tibble  3.1.2          v dplyr   1.0.6     
## v tidyr   1.1.3          v stringr 1.4.0     
## v readr   1.4.0          v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(labelled)
library(questionr)
library(gtsummary)
library(viridis)
## Le chargement a nécessité le package : viridisLite
library(scales)
## 
## Attachement du package : 'scales'
## L'objet suivant est masqué depuis 'package:viridis':
## 
##     viridis_pal
## L'objet suivant est masqué depuis 'package:purrr':
## 
##     discard
## L'objet suivant est masqué depuis 'package:readr':
## 
##     col_factor
load("care_trajectories.RData")
care_trajectories <- as_tibble(care_trajectories)
var_label(care_trajectories$sex) <- "Sexe"
val_labels(care_trajectories$sex) <- c(homme = 0, femme = 1)
var_label(care_trajectories$age) <- "Âge"
var_label(care_trajectories$education) <- "Education"
val_labels(care_trajectories$education) <- c(
  primaire = 1,
  secondaire = 2,
  supérieur = 3
)
val_labels(care_trajectories$care_status) <- c(
  "diagnostiqué, mais pas suivi" = "D",
  "suivi, mais pas sous traitement" = "C",
  "sous traitement, mais infection non contrôlée" = "T",
  "sous traitement et infection contrôlée" = "S"
)
care_trajectories <- care_trajectories %>%
  set_variable_labels(
    id = "Identifiant Patient",
    month = "Mois depuis la diagnostic",
    care_status = "Statut dans les soins",
    wealth = "Niveau de richesse",
    distance_clinic = "Distance à la clinique la plus proche"
  ) %>%
  set_value_labels(
    wealth = c(bas = 1, moyen = 2, haut = 3),
    distance_clinic = c("moins de 10 km" = 1, "10 km ou plus" = 2)
  )

Première description des données

care_trajectories %>%
  unlabelled() %>%
  tbl_summary()
Characteristic N = 49,3651
Identifiant Patient 4,961 (2,481, 7,532)
Mois depuis la diagnostic 9 (4, 16)
Statut dans les soins
diagnostiqué, mais pas suivi 25,374 (51%)
suivi, mais pas sous traitement 5,886 (12%)
sous traitement, mais infection non contrôlée 4,596 (9.3%)
sous traitement et infection contrôlée 13,509 (27%)
Sexe
homme 17,781 (36%)
femme 31,584 (64%)
Âge
16-29 16,911 (34%)
30-59 29,365 (59%)
60+ 3,089 (6.3%)
Education
primaire 10,417 (21%)
secondaire 19,024 (39%)
supérieur 19,924 (40%)
Niveau de richesse
bas 15,432 (31%)
moyen 20,769 (42%)
haut 13,164 (27%)
Distance à la clinique la plus proche
moins de 10 km 26,804 (54%)
10 km ou plus 22,561 (46%)

1 Median (IQR); n (%)

Le fichier contient en tout 49365 lignes (une par individu et par mois de suivi). Il correspond à 2929 individus différents.

n <- care_trajectories %>%
  filter(month %in% (0:8*6)) %>%
  group_by(month) %>%
  count() %>%
  pluck("n")

etiquettes <- paste0("M", 0:8*6, "\n(n=", n, ")")

ggplot(care_trajectories) +
  aes(x = month, fill = to_factor(care_status)) +
  geom_bar(width = 1, color = "gray50") +
  scale_fill_viridis(discrete = TRUE, direction = -1) +
  xlab("") + ylab("") +
  scale_x_continuous(breaks = 0:8*6, labels = etiquettes) +
  labs(fill = "Statut dans les soins") +
  ggtitle("Distribution du statut dans les soins chaque mois") +
  guides(fill = guide_legend(nrow = 2)) +
  theme_light() +
  theme(legend.position = "bottom")

ggplot(care_trajectories %>% filter(month <= 36)) +
  aes(x = month, fill = to_factor(care_status)) +
  geom_bar(width = 1, color = "gray50", position = "fill") +
  scale_fill_viridis(discrete = TRUE, direction = -1) +
  xlab("") + ylab("") +
  scale_x_continuous(breaks = 0:8*6, labels = etiquettes) +
  scale_y_continuous(labels = percent, breaks = 0:5/5) +
  labs(fill = "Statut dans les soins") +
  ggtitle("Cascade des soins observée, selon le temps depuis le diagnostic") +
  guides(fill = guide_legend(nrow = 2)) +
  theme_light() +
  theme(legend.position = "bottom")

Analyse de survie classique

ind <- care_trajectories %>% filter(month == 0)
ind$diagnostic <- 0

ind <- ind %>%
  left_join(
    care_trajectories %>%
      filter(care_status %in% c("C", "T", "S")) %>%
      group_by(id) %>%
      summarise(entree_soins = min(month)),
    by = "id"
  ) %>%
  left_join(
    care_trajectories %>%
      filter(care_status %in% c("T", "S")) %>%
      group_by(id) %>%
      summarise(initiation_tt = min(month)),
    by = "id"
  ) %>%
  left_join(
    care_trajectories %>%
      filter(care_status == "S") %>%
      group_by(id) %>%
      summarise(controle = min(month)),
    by = "id"
  ) %>%
  left_join(
    care_trajectories %>%
      group_by(id) %>%
      summarise(suivi = max(month)),
    by = "id"
  )

Un premier exemple : temps entre l’entrée en soins et l’initiation d’un traitement

library(survival)
library(broom)

tmp <- ind %>% filter(!is.na(entree_soins))

tmp$event <- FALSE
tmp$time <- tmp$suivi - tmp$entree_soins

tmp$event <- !is.na(tmp$initiation_tt)
tmp$time[tmp$event] <- tmp$initiation_tt[tmp$event] - tmp$entree_soins[tmp$event]


kaplan <- survfit(Surv(time, event) ~ 1 , data = tmp)
res <- tidy(kaplan, conf.int = TRUE)


ggplot(res) +
  aes(x = time, y = estimate) +
  geom_line()

Temps depuis le diagnostic

km <- function(date_origine, date_evenement, nom = ""){
  library(survival)
  library(broom)
  
  tmp <- ind 
  tmp <- tmp %>% filter(!is.na(tmp[[date_origine]]))
  
  tmp$event <- FALSE
  tmp$time <- tmp$suivi - tmp[[date_origine]]
  
  tmp$event <- !is.na(tmp[[date_evenement]])
  tmp$time[tmp$event] <- tmp[[date_evenement]][tmp$event] - tmp[[date_origine]][tmp$event]
  
  
  kaplan <- survfit(Surv(time, event) ~ 1 , data = tmp)
  res <- tidy(kaplan, conf.int = TRUE)
  res$nom <- nom
  
  res
}
depuis_diag <- bind_rows(
  km("diagnostic", "entree_soins", "Entrée en soins"),
  km("diagnostic", "initiation_tt", "Initiation du traitement"),
  km("diagnostic", "controle", "Contrôle de l'infection")
)
depuis_diag$nom <- fct_inorder(depuis_diag$nom)
ggplot(depuis_diag) +
  aes(
    x = time, y = 1 - estimate, colour = nom,
    ymin = 1 - conf.high, ymax = 1 - conf.low, fill = nom
  ) +
  geom_ribbon(alpha = .25, mapping = aes(colour = NULL)) +
  geom_line() +
  scale_x_continuous(limits = c(0, 36), breaks = 0:6*6) +
  scale_y_continuous(labels = scales::percent) +
  xlab("mois depuis le diagnostic") +
  ylab("") + labs(color = "", fill = "") +
  theme_classic() +
  theme(
    legend.position = "bottom",
    panel.grid.major.y = element_line(colour = "grey")
  )
## Warning: Removed 31 row(s) containing missing values (geom_path).

depuis_prec <- bind_rows(
  km("diagnostic", "entree_soins", "Entrée en soins"),
  km("entree_soins", "initiation_tt", "Initiation du traitement"),
  km("initiation_tt", "controle", "Contrôle de l'infection")
)
depuis_prec$nom <- fct_inorder(depuis_prec$nom)
ggplot(depuis_prec) +
  aes(
    x = time, y = 1 - estimate, colour = nom,
    ymin = 1 - conf.high, ymax = 1 - conf.low, fill = nom
  ) +
  geom_ribbon(alpha = .25, mapping = aes(colour = NULL)) +
  geom_line() +
  scale_x_continuous(limits = c(0, 36), breaks = 0:6*6) +
  scale_y_continuous(labels = scales::percent) +
  xlab("mois depuis l'étape précédente") +
  ylab("") + labs(color = "", fill = "") +
  theme_classic() +
  theme(
    legend.position = "bottom",
    panel.grid.major.y = element_line(colour = "grey")
  ) +
  facet_grid(cols = vars(nom))
## Warning: Removed 13 row(s) containing missing values (geom_path).

Selon le sexe

km_sexe <- function(date_origine, date_evenement, nom = ""){
  library(survival)
  library(broom)
  
  tmp <- ind 
  tmp <- tmp %>% filter(!is.na(tmp[[date_origine]]))
  
  tmp$event <- FALSE
  tmp$time <- tmp$suivi - tmp[[date_origine]]
  
  tmp$event <- !is.na(tmp[[date_evenement]])
  tmp$time[tmp$event] <- tmp[[date_evenement]][tmp$event] - tmp[[date_origine]][tmp$event]
  
  tmp$sexe <- to_factor(tmp$sex)
  
  
  kaplan <- survfit(Surv(time, event) ~ sexe , data = tmp)
  res <- tidy(kaplan, conf.int = TRUE)
  res$nom <- nom
  
  res
}

depuis_prec_sexe <- bind_rows(
  km_sexe("diagnostic", "entree_soins", "Entrée en soins"),
  km_sexe("entree_soins", "initiation_tt", "Initiation du traitement"),
  km_sexe("initiation_tt", "controle", "Contrôle de l'infection")
)
depuis_prec_sexe$nom <- fct_inorder(depuis_prec_sexe$nom)
depuis_prec_sexe$sexe <- depuis_prec_sexe$strata %>%
  str_sub(start = 6)
ggplot(depuis_prec_sexe) +
  aes(
    x = time, y = 1 - estimate, colour = sexe,
    ymin = 1 - conf.high, ymax = 1 - conf.low, fill = sexe
  ) +
  geom_ribbon(alpha = .25, mapping = aes(colour = NULL)) +
  geom_line() +
  scale_x_continuous(limits = c(0, 36), breaks = 0:6*6) +
  scale_y_continuous(labels = scales::percent) +
  xlab("mois depuis l'étape précédente") +
  ylab("") + labs(color = "", fill = "") +
  theme_classic() +
  theme(
    legend.position = "bottom",
    panel.grid.major.y = element_line(colour = "grey")
  ) +
  facet_grid(cols = vars(nom))

Analyse de séquences sur l’ensemble du fichier

large <- care_trajectories %>%
  select(id, month, care_status) %>%
  pivot_wider(names_from = month, values_from = care_status, names_prefix = "m")

library(TraMineR, quietly = TRUE)
## 
## TraMineR stable version 2.2-1 (Built: 2020-11-02)
## Website: http://traminer.unige.ch
## Please type 'citation("TraMineR")' for citation information.
seq_all <- seqdef(
  large %>% select(-id),
  id = large$id,
  alphabet = c("D", "C", "T", "S"),
  states = c("diagnostiqué", "en soins", "sous traitement", "controlé"),
  cpal = viridis(4, direction = -1)
)
##  [>] found missing values ('NA') in sequence data
##  [>] preparing 2929 sequences
##  [>] coding void elements with '%' and missing values with '*'
##  [>] state coding:
##        [alphabet]  [label]         [long label]
##      1  D           diagnostiqué    diagnostiqué
##      2  C           en soins        en soins
##      3  T           sous traitement sous traitement
##      4  S           controlé        controlé
##  [>] 2929 sequences in the data set
##  [>] min/max sequence length: 1/51
seqdplot(seq_all, legend.prop = .25)

couts <- seqcost(seq_all, method = "CONSTANT")
##  [>] creating 4x4 substitution-cost matrix using 2 as constant value
couts$sm[1,] <- c(0, 1, 2, 3)
couts$sm[2,] <- c(1, 0, 1, 2)
couts$sm[3,] <- c(2, 1, 0, 1)
couts$sm[4,] <- c(3, 2, 1, 0)
couts$indel <- max(couts$sm)/2

couts
## $indel
## [1] 1.5
## 
## $sm
##                   diagnostiqué-> en soins-> sous traitement-> controlé->
## diagnostiqué->                 0          1                 2          3
## en soins->                     1          0                 1          2
## sous traitement->              2          1                 0          1
## controlé->                     3          2                 1          0
dist_all <- seqdist(seq_all, method = "OM", sm = couts$sm, indel = couts$indel)
##  [>] 2929 sequences with 4 distinct states
##  [>] checking 'sm' (size and triangle inequality)
##  [>] 1370 distinct  sequences
##  [>] min/max sequence lengths: 1/51
##  [>] computing distances using the OM metric
##  [>] elapsed time: 2.07 secs
arbre_all <- hclust(as.dist(dist_all), method = "ward.D2")
plot(arbre_all)

library(seqhandbook)
seq_heatmap(seq_all, arbre_all)

Analyse de séquences sur 18 mois

large$seq_length <- seqlength(seq_all)
large_m18 <- large %>%
  filter(seq_length >= 19) %>%
  select(id, m0:m18)

seq_m18 <- seqdef(
  large_m18 %>% select(-id),
  id = large_m18$id,
  alphabet = c("D", "C", "T", "S"),
  states = c("diagnostiqué", "en soins", "sous traitement", "controlé"),
  cpal = viridis(4, direction = -1)
)
##  [>] state coding:
##        [alphabet]  [label]         [long label]
##      1  D           diagnostiqué    diagnostiqué
##      2  C           en soins        en soins
##      3  T           sous traitement sous traitement
##      4  S           controlé        controlé
##  [>] 1317 sequences in the data set
##  [>] min/max sequence length: 19/19
seqdplot(seq_m18, legend.prop = .25)

dist_m18 <- seqdist(seq_m18, method = "OM", sm = couts$sm, indel = couts$indel)
##  [>] 1317 sequences with 4 distinct states
##  [>] checking 'sm' (size and triangle inequality)
##  [>] 578 distinct  sequences
##  [>] min/max sequence lengths: 19/19
##  [>] computing distances using the OM metric
##  [>] elapsed time: 0.31 secs
arbre_m18 <- hclust(as.dist(dist_m18), method = "ward.D2")
plot(arbre_m18)

seq_heatmap(seq_m18, arbre_m18)

library(WeightedCluster, quietly = TRUE)
## This is WeightedCluster stable version 1.4-1 (Built: 2020-07-07)
## 
## To get the manuals, please run:
##    vignette("WeightedCluster") ## Complete manual in English
##    vignette("WeightedClusterFR") ## Complete manual in French
##    vignette("WeightedClusterPreview") ## Short preview in English
## 
## To cite WeightedCluster in publications please use:
## Studer, Matthias (2013). WeightedCluster Library Manual: A practical
##    guide to creating typologies of trajectories in the social sciences
##    with R. LIVES Working Papers, 24. doi:
##    10.12682/lives.2296-1658.2013.24
seqtree_m18 <- as.seqtree(arbre_m18, seqdata = seq_m18, diss = dist_m18, ncluster = 7)
seqtreedisplay(seqtree_m18, type = "I", filename = "seqtree_m18.png", show.depth = TRUE)
##  [>] GraphViz version 2.38 found.

mon seqtree

large_m18$typo_cah <- cutree(arbre_m18, k = 4)
pam_m18 <- wcKMedoids(dist_m18, k = 4, initialclust = arbre_m18)
large_m18$typo_pam <- pam_m18$clustering

library(gtsummary, quietly = TRUE)
large_m18 %>%
  tbl_cross(row = typo_cah, col = typo_pam)
Characteristic typo_pam Total
6 23 85 410
typo_cah
1 522 4 0 39 565
2 0 3 330 3 336
3 1 203 59 13 276
4 23 0 5 112 140
Total 546 210 394 167 1,317
large_m18$ordre_cmd <- cmdscale(as.dist(dist_m18), k = 1)
seqIplot(seq_m18, group = large_m18$typo_cah)

seqIplot(seq_m18, group = large_m18$typo_cah, sortv = large_m18$ordre_cmd)

seqIplot(seq_m18, group = large_m18$typo_pam, sortv = large_m18$ordre_cmd)

tab <- tibble(
  indicateur = names(wcClusterQuality(dist_m18, large_m18$typo_cah)$stats),
  cah = wcClusterQuality(dist_m18, large_m18$typo_cah)$stats,
  pam = wcClusterQuality(dist_m18, large_m18$typo_pam)$stats
)
tab %>% gt::gt() %>% gt::fmt_number(2:3, decimals = 2, sep_mark = " ", dec_mark = ",")
indicateur cah pam
PBC 0,70 0,71
HG 0,93 0,95
HGSD 0,93 0,95
ASW 0,53 0,57
ASWw 0,53 0,57
CH 938,97 1 023,12
R2 0,68 0,70
CHsq 3 351,21 3 980,60
R2sq 0,88 0,90
HC 0,03 0,02
large_m18$groupe <- as.character(large_m18$typo_pam)
large_m18$groupe <- fct_recode(large_m18$groupe,
  "Hors Soins" = "6",
  "Lents" = "23",
  "Rapides" = "85",
  "Inaboutis" = "410"
)
large_m18$groupe <- fct_relevel(
  large_m18$groupe,
  "Rapides", "Lents", "Inaboutis", "Hors Soins"
)

large_m18 <- large_m18 %>%
  mutate(rang_cmd = rank(ordre_cmd, ties.method = "first"))

long_m18 <- large_m18 %>% 
  pivot_longer(m0:m18, names_to = "month", values_to = "care_status", names_prefix = "m")
  
long_m18$month <- as.integer(long_m18$month)
long_m18$care_statusF <- to_factor(long_m18$care_status)

ggplot(long_m18) +
  aes(x = month, y = factor(rang_cmd), fill = care_statusF) +
  geom_raster() +
  scale_fill_viridis(discrete = TRUE, direction = -1) +
  facet_grid(rows = vars(groupe), scales = "free", space = "free") +
  labs(x = NULL, y = NULL, fill = NULL) +
  scale_y_discrete(labels = NULL) +
  scale_x_continuous(
    expand = expansion(0, 0), 
    breaks = 0:3*6,
    labels = paste0("M", 0:3*6)
  ) +
  guides(
    fill = guide_legend(ncol = 2)
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom"
  )

Facteurs associés à chaque groupe

large_m18 <- large_m18 %>%
  left_join(
    care_trajectories %>%
      filter(month == 0) %>%
      select(id, sex, age, education),
    by = "id"
  )
library(gtsummary)

theme_gtsummary_language("fr", decimal.mark = ",", big.mark = " ")
## Setting theme `language: fr`
large_m18 %>%
  select(groupe, age, sex, education) %>%
  unlabelled() %>%
  tbl_summary(
    by = "groupe", 
    digits = all_categorical() ~ c(0, 1)
  ) %>%
  add_overall(last = TRUE) %>%
  add_p()
Caractéristique Rapides, N = 3941 Lents, N = 2101 Inaboutis, N = 1671 Hors Soins, N = 5461 Total, N = 1 3171 p-valeur2
Âge <0,001
16-29 96 (24,4%) 71 (33,8%) 62 (37,1%) 235 (43,0%) 464 (35,2%)
30-59 269 (68,3%) 128 (61,0%) 91 (54,5%) 281 (51,5%) 769 (58,4%)
60+ 29 (7,4%) 11 (5,2%) 14 (8,4%) 30 (5,5%) 84 (6,4%)
Sexe <0,001
homme 104 (26,4%) 61 (29,0%) 42 (25,1%) 243 (44,5%) 450 (34,2%)
femme 290 (73,6%) 149 (71,0%) 125 (74,9%) 303 (55,5%) 867 (65,8%)
Education 0,002
primaire 63 (16,0%) 47 (22,4%) 30 (18,0%) 123 (22,5%) 263 (20,0%)
secondaire 126 (32,0%) 83 (39,5%) 61 (36,5%) 212 (38,8%) 482 (36,6%)
supérieur 205 (52,0%) 80 (38,1%) 76 (45,5%) 211 (38,6%) 572 (43,4%)

1 n (%)

2 test du khi-deux d'indépendance

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
large_m18 %>%
  unlabelled() %>%
  ggbivariate(
    outcome = "groupe",
    explanatory = c("age", "sex", "education")
  )

large_m18 %>%
  unlabelled() %>%
  ggtable(
    columnsX = "groupe",
    columnsY = c("age", "sex", "education"),
    cells = "col.prop",
    fill = "std.resid",
    legend = 1
  )

library(nnet)
large_m18$groupe2 <- large_m18$groupe %>%
  fct_relevel("Hors Soins")
regm <- multinom(
  groupe2 ~ sex + age + education,
  data = unlabelled(large_m18)
)
## # weights:  28 (18 variable)
## initial  value 1825.749674 
## iter  10 value 1648.561757
## iter  20 value 1638.851444
## final  value 1638.811514 
## converged
regm %>%
  tbl_regression()
## i Multinomial models have a different underlying structure than the models
## gtsummary was designed for. Other gtsummary functions designed to work with
## tbl_regression objects may yield unexpected results.
Caractéristique log(OR)1 95% CI1 p-valeur
Rapides
Sexe
homme — —
femme 0,88 0,59 – 1,2 <0,001
Âge
16-29 — —
30-59 0,82 0,50 – 1,1 <0,001
60+ 0,79 0,16 – 1,4 0,013
Education
primaire — —
secondaire 0,18 -0,21 – 0,56 0,4
supérieur 0,35 -0,04 – 0,75 0,080
Lents
Sexe
homme — —
femme 0,72 0,37 – 1,1 <0,001
Âge
16-29 — —
30-59 0,56 0,19 – 0,93 0,003
60+ 0,48 -0,33 – 1,3 0,2
Education
primaire — —
secondaire 0,05 -0,38 – 0,48 0,8
supérieur -0,20 -0,66 – 0,26 0,4
Inaboutis
Sexe
homme — —
femme 0,91 0,52 – 1,3 <0,001
Âge
16-29 — —
30-59 0,20 -0,21 – 0,61 0,3
60+ 0,55 -0,22 – 1,3 0,2
Education
primaire — —
secondaire 0,20 -0,30 – 0,69 0,4
supérieur 0,30 -0,23 – 0,82 0,3

1 OR = rapport de cotes, CI = intervalle de confiance

regm %>%
  ggcoef_multinom(exponentiate = TRUE)

regm %>%
  ggcoef_multinom(exponentiate = TRUE, type = "f")

library(ggeffects)
ggeffect(regm) %>%
  plot() %>%
  cowplot::plot_grid(plotlist = ., ncol = 3)