# ANALYSE DE SURVIE

# Chargement des données -----

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3.9000     v purrr   0.3.4     
## v tibble  3.1.1          v dplyr   1.0.5     
## v tidyr   1.1.3          v stringr 1.4.0     
## v readr   1.4.0          v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(labelled)
library(questionr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(survival)
library(survminer)
## Loading required package: ggpubr
## Warning: replacing previous import 'ggplot2:::=' by 'data.table:::=' when
## loading 'survMisc'
data(fecondite)

# Recodages -------

## durée d'observation

enfants <- enfants %>%
  left_join(
    femmes %>% select(id_femme, date_entretien),
    by = "id_femme"
  )

enfants <- enfants %>%
  mutate(
    duree_observation = interval(date_naissance, date_entretien) %>% 
      time_length(unit = "month")
  )

enfants <- enfants %>%
  mutate(
    date_entretien = date_entretien %>%
      recode_if(
        date_entretien < date_naissance,
        date_entretien %m+% months(1)
      ),
    duree_observation = interval(date_naissance, date_entretien) %>% 
      time_length(unit = "month"),
    duree_observation_mois_revolus = trunc(duree_observation)
  )

## age au décès imputation

enfants <- enfants %>%
  mutate(
    age_deces_impute = age_deces + runif(n())  
  )

## variables de survie

enfants <- enfants %>%
  mutate(
    deces = if_else(survie == 0, 1, 0),
    time = if_else(
      survie == 0,
      age_deces_impute,
      duree_observation
    ),
    time_revolus = if_else(
      survie == 0,
      age_deces,
      duree_observation_mois_revolus
    )
  ) %>%
  set_variable_labels(deces = "Est décédé ?") %>%
  set_value_labels(deces = c("en vie" = 0, "décédé" = 1))


## variables explicatives

enfants <- enfants %>%
  left_join(
    femmes %>% 
      select(id_femme, id_menage, milieu, educ, date_naissance_mere = date_naissance, nb_enf_ideal),
    by = "id_femme"
  ) %>%
  left_join(
    menages %>%
      select(id_menage, structure, richesse),
    by = "id_menage"
  )

enfants <- enfants %>%
  mutate(
    sexe = to_factor(sexe),
    richesse = to_factor(richesse),
    milieu = to_factor(milieu),
    structure = to_factor(structure) %>%
      fct_drop() %>%
      fct_relevel("deux adultes de sexe opposé"),
    educ2 = to_factor(educ) %>%
      fct_recode(
        "secondaire et sup" = "secondaire", 
        "secondaire et sup" = "supérieur"
      ),
    age_mere_naissance = interval(date_naissance_mere, date_naissance) %>%
      time_length(unit = "years")
  )

enfants$gp_age_mere_naissance <- cut(enfants$age_mere_naissance,
  include.lowest = TRUE,
  right = FALSE,
  breaks = c(-Inf, 20, 30, Inf)
)

enfants$gp_age_mere_naissance <- enfants$gp_age_mere_naissance %>%
  fct_recode(
    "19 ans ou moins" = "[-Inf,20)",
    "20-29 ans" = "[20,30)",
    "30 ans ou plus" = "[30, Inf]"
  )

enfants <- enfants %>%
  arrange(id_femme, date_naissance) %>%
  group_by(id_femme) %>%
  mutate(
    rang = rank(date_naissance, ties.method = "max"),
    nb_enf_ideal = nb_enf_ideal %>% user_na_to_na() %>% unclass(), # transformer les valeurs indiquées manquantes en NA
    rang_apres_ideal = if_else(rang > nb_enf_ideal, "oui", "non") %>%
      factor(levels = c("non", "oui"))
  ) %>%
  set_variable_labels(rang_apres_ideal = "Rang de naissance plus élevé qu'idéal")

# Courbe de Kaplan-Meier -----

km_globale <- survfit(Surv(time, deces) ~ 1, data = enfants)
ggsurvplot(km_globale)

ggsurvplot(km_globale, fun = "event", risk.table = TRUE, surv.scale = "percent", break.time.by = 12)

km_globale_revolus <- survfit(Surv(time_revolus, deces) ~ 1, data = enfants)
ggsurvplot(km_globale_revolus, fun = "event", risk.table = TRUE, surv.scale = "percent", break.time.by = 12)

km_sexe <- survfit(Surv(time, deces) ~ sexe, data = enfants)
survdiff(Surv(time, deces) ~ sexe, data = enfants)
## Call:
## survdiff(formula = Surv(time, deces) ~ sexe, data = enfants)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## sexe=masculin 762       94     66.3      11.6      21.8
## sexe=féminin  822       48     75.7      10.2      21.8
## 
##  Chisq= 21.8  on 1 degrees of freedom, p= 3e-06
ggsurvplot(km_sexe, fun = "event", risk.table = TRUE, surv.scale = "percent", break.time.by = 12, pval = TRUE)

km_sexe_age <- survfit(Surv(time, deces) ~ sexe + gp_age_mere_naissance, data = enfants)
ggsurvplot(km_sexe_age, fun = "event", risk.table = TRUE, surv.scale = "percent", break.time.by = 12, pval = TRUE)

## Modèle de Cox

mod1 <- coxph(
  Surv(time, deces) ~ sexe + milieu + richesse + 
    structure + educ2 + gp_age_mere_naissance + rang_apres_ideal,
  data = enfants
)

library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.0.5
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
tbl <- tbl_regression(mod1, exponentiate = TRUE)
tbl %>% 
  add_global_p(keep = TRUE) %>%
  add_significance_stars(hide_p = FALSE)
## add_global_p: Global p-values for variable(s) `add_global_p(include =
## c("sexe", "milieu", "richesse", "structure", "educ2", "gp_age_mere_naissance",
## "rang_apres_ideal"))` were calculated with
##   `car::Anova(x$model_obj, type = "III")`
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Characteristic HR1,2 SE2 p-value
Sexe de l'enfant <0.001
masculin — —
féminin 0.44*** 0.178 <0.001
Milieu de résidence 0.013
urbain — —
rural 1.92* 0.270 0.015
Niveau de vie (quintiles) 0.5
très pauvre — —
pauvre 0.93 0.251 0.8
moyen 1.38 0.248 0.2
riche 1.42 0.298 0.2
très riche 1.59 0.429 0.3
Structure démographique du ménage 0.6
deux adultes de sexe opposé — —
un adulte 0.87 0.600 0.8
deux adultes de même sexe 1.83 0.377 0.11
trois adultes ou plus avec lien de parenté 1.05 0.197 0.8
adultes sans lien de parenté 0.88 0.305 0.7
Niveau d'éducation 0.8
aucun — —
primaire 0.97 0.206 0.9
secondaire et sup 0.82 0.367 0.6
gp_age_mere_naissance 0.5
19 ans ou moins — —
20-29 ans 1.36 0.268 0.2
30 ans ou plus 1.36 0.291 0.3
Rang de naissance plus élevé qu'idéal 0.054
non — —
oui 4.04* 0.603 0.021

1 *p<0.05; **p<0.01; ***p<0.001

2 HR = Hazard Ratio, SE = Standard Error

plot(tbl)

ggcoef_model(mod1, exponentiate = TRUE)

mod2 <- step(mod1)
## Start:  AIC=2026.96
## Surv(time, deces) ~ sexe + milieu + richesse + structure + educ2 + 
##     gp_age_mere_naissance + rang_apres_ideal
## 
##                         Df    AIC
## - structure              4 2021.9
## - richesse               4 2022.6
## - educ2                  2 2023.3
## - gp_age_mere_naissance  2 2024.5
## <none>                     2027.0
## - rang_apres_ideal       1 2028.7
## - milieu                 1 2031.2
## - sexe                   1 2047.1
## 
## Step:  AIC=2021.87
## Surv(time, deces) ~ sexe + milieu + richesse + educ2 + gp_age_mere_naissance + 
##     rang_apres_ideal
## 
##                         Df    AIC
## - richesse               4 2016.9
## - educ2                  2 2018.1
## - gp_age_mere_naissance  2 2019.3
## <none>                     2021.9
## - rang_apres_ideal       1 2023.4
## - milieu                 1 2025.4
## - sexe                   1 2042.1
## 
## Step:  AIC=2016.86
## Surv(time, deces) ~ sexe + milieu + educ2 + gp_age_mere_naissance + 
##     rang_apres_ideal
## 
##                         Df    AIC
## - educ2                  2 2013.1
## - gp_age_mere_naissance  2 2014.7
## <none>                     2016.9
## - rang_apres_ideal       1 2018.0
## - milieu                 1 2018.6
## - sexe                   1 2037.4
## 
## Step:  AIC=2013.13
## Surv(time, deces) ~ sexe + milieu + gp_age_mere_naissance + rang_apres_ideal
## 
##                         Df    AIC
## - gp_age_mere_naissance  2 2011.0
## <none>                     2013.1
## - rang_apres_ideal       1 2014.3
## - milieu                 1 2015.6
## - sexe                   1 2033.6
## 
## Step:  AIC=2011.03
## Surv(time, deces) ~ sexe + milieu + rang_apres_ideal
## 
##                    Df    AIC
## <none>                2011.0
## - rang_apres_ideal  1 2012.3
## - milieu            1 2013.8
## - sexe              1 2031.4
ggcoef_model(mod2, exponentiate = TRUE)

ggforest(mod2)
## Warning in .get_data(model, data = data): The `data` argument is not provided.
## Data will be extracted from model fit.

## Validité du modèle

test <- cox.zph(mod2)
test
##                   chisq df    p
## sexe             0.2510  1 0.62
## milieu           0.0129  1 0.91
## rang_apres_ideal 0.7758  1 0.38
## GLOBAL           1.0467  3 0.79
ggcoxzph(test)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values