# WEBIN-R #16 : Analyses de séquences

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3.9000     v purrr   0.3.4     
## v tibble  3.1.1          v dplyr   1.0.5     
## v tidyr   1.1.3          v stringr 1.4.0     
## v readr   1.4.0          v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(labelled)
library(TraMineR)
## 
## TraMineR stable version 2.2-1 (Built: 2020-11-02)
## Website: http://traminer.unige.ch
## Please type 'citation("TraMineR")' for citation information.
library(tidyr)

donnees <- read_csv("trajpro.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double()
## )
## i Use `spec()` for the full column specifications.
donnees$generation <- factor(
  donnees$generation,
  levels = 1:3,
  labels = c("1930-1938", "1939-1945", "1946-1950")
)

labels <- c(
  agric = "agriculteurs exploitants", # 1
  acce = "artisans, commercants et chefs d'entreprise", # 2
  cadr = "cadres et professions intellectuelles supérieures", # 3
  pint = "professions intermédiaires", # 4
  empl = "employés", # 5
  ouvr = "ouvriers", # 6
  etud = "études", # 7
  inact = "inactivité", # 8
  smil = "service militaire" # 9
)

seq <- seqdef(
  donnees %>% select(csp1:csp37),
  alphabet = 1:9,
  states = names(labels),
  labels = names(labels)
)
##  [>] state coding:
##        [alphabet]  [label]  [long label]
##      1             1agric    agric
##      2             2acce     acce
##      3             3cadr     cadr
##      4             4pint     pint
##      5             5empl     empl
##      6             6ouvr     ouvr
##      7             7etud     etud
##      8             8inact    inact
##      9             9smil     smil
##  [>] 1000 sequences in the data set
##  [>] min/max sequence length: 37/37
seq.om <- seqdist(seq, method = "LCS") %>%
  as.dist()
##  [>] 1000 sequences with 9 distinct states
##  [>] creating a 'sm' with a substitution cost of 2
##  [>] creating 9x9 substitution-cost matrix using 2 as constant value
##  [>] 818 distinct  sequences
##  [>] min/max sequence lengths: 37/37
##  [>] computing distances using the LCS metric
##  [>] elapsed time: 1.88 secs
seq.arbre <- hclust(seq.om, method = "ward.D2")

plot(seq.arbre)

seq.arbre$height %>% 
  sort(decreasing = TRUE) %>% 
  head(20) %>%
  plot(type = "s")

seq.part <- cutree(seq.arbre, k = 5) %>%
  factor(levels = 1:5, labels = paste("Classe", 1:5))

questionr::freq(seq.part)
##            n    % val%
## Classe 1 366 36.6 36.6
## Classe 2 183 18.3 18.3
## Classe 3 104 10.4 10.4
## Classe 4 296 29.6 29.6
## Classe 5  51  5.1  5.1
seqdplot(seq, group = seq.part, xtlab = 14:50)

ordre <- seq.om %>% cmdscale(k = 1)
seqIplot(seq, group = seq.part, xtlab = 14:50, sortv = ordre)

library(seqhandbook)
seq_heatmap(seq, seq.arbre, labCol = 14:50)

donnees$id <- 1:nrow(donnees)
donnees$classe <- seq.part
donnees$ordre <- ordre %>% rank(ties.method = "random")

long <- donnees %>%
  pivot_longer(
    cols = csp1:csp37,
    names_to = "annee",
    values_to = "csp"
  )

long$csp <- factor(
  long$csp,
  levels = 1:9,
  labels = labels
)

long$age <- long$annee %>%
  str_sub(start = 4) %>%
  as.integer() + 13

ggplot(long) +
  aes(x = age, y = factor(ordre), fill = csp) +
  geom_raster() +
  theme_bw() +
  scale_fill_brewer(palette = "Set3") +
  facet_grid(rows = vars(classe), scales = "free_y", space = "free_y") +
  scale_y_discrete(label = NULL) +
  scale_x_continuous(
    limits = c(14, 50), 
    breaks = c(14, 20, 25, 30, 35, 40, 45, 50)
  )
## Warning: Removed 2000 rows containing missing values (geom_raster).

seqfplot(seq, group = seq.part, xtlab = 14:50)

seqmsplot(seq, group = seq.part, xtlab = 14:50)

seqmtplot(seq, group = seq.part, xtlab = 14:50)
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique

seqrplot(seq, group = seq.part, xtlab = 14:50, dist.matrix = seq.om, method = "dist")
##  [>] number of objects (sum of weights): 366
##  [>] max. distance: 74
##  [>] neighborhood radius: 7.4
##  [>] 4 representative(s) selected, coverage=28% (threshold=25%)
##  [>] 343 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "method" n'est pas un paramètre graphique
##  [>] number of objects (sum of weights): 183
##  [>] max. distance: 74
##  [>] neighborhood radius: 7.4
##  [>] 1 representative(s) selected, coverage=57% (threshold=25%)
##  [>] 121 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
##  [>] number of objects (sum of weights): 104
##  [>] max. distance: 74
##  [>] neighborhood radius: 7.4
##  [>] 3 representative(s) selected, coverage=30% (threshold=25%)
##  [>] 99 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
##  [>] number of objects (sum of weights): 296
##  [>] max. distance: 74
##  [>] neighborhood radius: 7.4
##  [>] 1 representative(s) selected, coverage=48% (threshold=25%)
##  [>] 204 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
##  [>] number of objects (sum of weights): 51
##  [>] max. distance: 74
##  [>] neighborhood radius: 7.4
##  [>] 6 representative(s) selected, coverage=27% (threshold=25%)
##  [>] 51 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

seqHtplot(seq, group = seq.part, xtlab = 14:50)

library(gtsummary)
donnees %>%
  select(generation, classe) %>%
  tbl_summary(by = classe, percent = "row") %>%
  add_p()
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Characteristic Classe 1, N = 3661 Classe 2, N = 1831 Classe 3, N = 1041 Classe 4, N = 2961 Classe 5, N = 511 p-value2
generation 0.018
1930-1938 121 (36%) 67 (20%) 22 (6.5%) 108 (32%) 22 (6.5%)
1939-1945 96 (33%) 54 (18%) 41 (14%) 86 (29%) 18 (6.1%)
1946-1950 149 (41%) 62 (17%) 41 (11%) 102 (28%) 11 (3.0%)

1 n (%)

2 Pearson's Chi-squared test