# analyse_sequences.R

# WEBIN-R #16 : Analyses de séquences

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.3.9000     v purrr   0.3.4     
## v tibble  3.1.1          v dplyr   1.0.5     
## v tidyr   1.1.3          v stringr 1.4.0     
## v readr   1.4.0          v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(labelled)
library(TraMineR)

## 
## TraMineR stable version 2.2-1 (Built: 2020-11-02)

## Website: http://traminer.unige.ch

## Please type 'citation("TraMineR")' for citation information.

library(tidyr)

# Load the trajectory data from the CSV file in the working directory.
# The rest of the script relies on the columns `generation` and
# `csp1` ... `csp37` being present in this file.
donnees <- read_csv(file = "trajpro.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double()
## )
## i Use `spec()` for the full column specifications.

# Turn the numeric generation code (1, 2, 3) into a factor labelled with
# year ranges (presumably birth cohorts -- TODO confirm with the data source).
donnees$generation <- factor(
  x = donnees$generation,
  levels = seq_len(3L),
  labels = c("1930-1938", "1939-1945", "1946-1950")
)

# Named character vector describing the nine states of the alphabet.
# Names are the short state codes, values are the long (French) labels.
# The position of each entry (1 to 9) is the numeric code used in the data.
labels <- setNames(
  c(
    "agriculteurs exploitants",
    "artisans, commercants et chefs d'entreprise",
    "cadres et professions intellectuelles supérieures",
    "professions intermédiaires",
    "employés",
    "ouvriers",
    "études",
    "inactivité",
    "service militaire"
  ),
  c("agric", "acce", "cadr", "pint", "empl", "ouvr", "etud", "inact", "smil")
)

# Build the TraMineR state-sequence object from the 37 columns csp1..csp37.
# The numeric codes 1..9 found in the data are mapped to the short state
# names; the same short names are also supplied as the state labels.
# NOTE(review): the object shares its name with base::seq().
seq <- seqdef(
  data = select(donnees, csp1:csp37),
  alphabet = seq_len(9L),
  states = names(labels),
  labels = names(labels)
)

##  [>] state coding:

##        [alphabet]  [label]  [long label]

##      1             1agric    agric

##      2             2acce     acce

##      3             3cadr     cadr

##      4             4pint     pint

##      5             5empl     empl

##      6             6ouvr     ouvr

##      7             7etud     etud

##      8             8inact    inact

##      9             9smil     smil

##  [>] 1000 sequences in the data set

##  [>] min/max sequence length: 37/37

# Pairwise dissimilarities between all sequences using the LCS metric,
# converted to a `dist` object.
# NOTE(review): despite the `.om` suffix, the method requested here is "LCS".
seq.om <- as.dist(seqdist(seq, method = "LCS"))

##  [>] 1000 sequences with 9 distinct states

##  [>] creating a 'sm' with a substitution cost of 2

##  [>] creating 9x9 substitution-cost matrix using 2 as constant value

##  [>] 818 distinct  sequences

##  [>] min/max sequence lengths: 37/37

##  [>] computing distances using the LCS metric

##  [>] elapsed time: 1.88 secs

# Hierarchical clustering of the sequences (Ward criterion, "ward.D2")
# computed on the dissimilarities.
seq.arbre <- hclust(d = seq.om, method = "ward.D2")

# Draw the dendrogram.
plot(x = seq.arbre)

# Step plot of the 20 largest merge heights of the dendrogram, in decreasing
# order. The value is piped into plot() so that the default axis label is
# the same as with the piped call form.
seq.arbre[["height"]] %>%
  sort(decreasing = TRUE) %>%
  head(n = 20) %>%
  plot(type = "s")

# Cut the dendrogram into 5 clusters and store the membership as a factor
# with levels "Classe 1" ... "Classe 5".
seq.part <- factor(
  cutree(seq.arbre, k = 5),
  levels = seq_len(5L),
  labels = paste("Classe", seq_len(5L))
)

# Frequency table of the cluster sizes.
questionr::freq(seq.part)

##            n    % val%
## Classe 1 366 36.6 36.6
## Classe 2 183 18.3 18.3
## Classe 3 104 10.4 10.4
## Classe 4 296 29.6 29.6
## Classe 5  51  5.1  5.1

# State distribution plot per cluster; positions 1..37 are labelled 14..50.
seqdplot(seqdata = seq, group = seq.part, xtlab = 14:50)

# One-dimensional classical MDS of the dissimilarities, used as the sort
# key for the index plots.
ordre <- cmdscale(seq.om, k = 1)
seqIplot(seqdata = seq, group = seq.part, sortv = ordre, xtlab = 14:50)

# Heatmap of all sequences ordered along the dendrogram.
library(seqhandbook)
seq_heatmap(seq, seq.arbre, labCol = 14:50)

# Attach a row identifier, the cluster membership and a plotting order to
# the data.
# seq_len() is used instead of 1:nrow() so that an empty data set yields an
# empty id vector rather than c(1, 0).
donnees$id <- seq_len(nrow(donnees))
donnees$classe <- seq.part
# Rank of the MDS coordinate. Ties are broken at random, so the resulting
# order can differ between runs unless a seed is set beforehand.
donnees$ordre <- rank(ordre, ties.method = "random")

# Reshape to long format: one row per individual and per position, with the
# original column name in `annee` and the state code in `csp`.
long <- pivot_longer(
  donnees,
  cols = csp1:csp37,
  names_to = "annee",
  values_to = "csp"
)

# Recode the numeric state (1..9) as a factor carrying the long labels.
long$csp <- factor(x = long$csp, levels = seq_len(9L), labels = labels)

# Derive the age from the column name: the digits after the "csp" prefix
# give the position, and position 1 corresponds to age 14.
long$age <- as.integer(str_sub(long$annee, start = 4)) + 13

# Index plot drawn with ggplot2: one raster row per individual (ordered by
# `ordre`), one column per age, filled by state and facetted by cluster.
ggplot(long) +
  aes(x = age, y = factor(ordre), fill = csp) +
  geom_raster() +
  theme_bw() +
  scale_fill_brewer(palette = "Set3") +
  facet_grid(rows = vars(classe), scales = "free_y", space = "free_y") +
  # Argument spelled out in full (`labels`) instead of relying on partial
  # matching; NULL hides the individual row labels.
  scale_y_discrete(labels = NULL) +
  # No `limits` here: geom_raster() tiles extend 0.5 on each side of their
  # centre, so limits of c(14, 50) treat the tiles at ages 14 and 50 as out
  # of bounds and drop them (2 ages x 1000 sequences = the 2000 removed rows
  # reported by the warning recorded after this call). A zero expansion
  # keeps the panel tight around the data instead.
  scale_x_continuous(
    breaks = c(14, 20, 25, 30, 35, 40, 45, 50),
    expand = expansion(mult = 0, add = 0)
  )

## Warning: Removed 2000 rows containing missing values (geom_raster).

# Most frequent sequences in each cluster.
seqfplot(seqdata = seq, group = seq.part, xtlab = 14:50)

# Modal state at each position in each cluster.
seqmsplot(seqdata = seq, group = seq.part, xtlab = 14:50)

# Mean time spent in each state, per cluster.
# `xtlab` is deliberately not passed: this plot has one bar per state rather
# than one tick per position, and the argument is only forwarded to base
# graphics, which rejects it with the '"xtlab" is not a graphical parameter'
# warnings recorded after this call.
seqmtplot(seq, group = seq.part)

## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique

## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique

## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique

## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique

## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique

## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique

# Representative sequences of each cluster, selected with the centrality
# ("dist") criterion.
# The dissimilarities are passed through `diss` and the criterion through
# `criterion`. With the former `dist.matrix` / `method` spelling, `method`
# was not recognised and only reached base graphics (see the warnings
# recorded after this call), so the default criterion was applied instead of
# "dist".
# as.matrix() provides the full square matrix so that it can be subset by
# group.
seqrplot(
  seq,
  group = seq.part,
  xtlab = 14:50,
  diss = as.matrix(seq.om),
  criterion = "dist"
)

##  [>] number of objects (sum of weights): 366

##  [>] max. distance: 74

##  [>] neighborhood radius: 7.4

##  [>] 4 representative(s) selected, coverage=28% (threshold=25%)

##  [>] 343 distinct sequence(s)

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "method" n'est pas un paramètre graphique

##  [>] number of objects (sum of weights): 183

##  [>] max. distance: 74

##  [>] neighborhood radius: 7.4

##  [>] 1 representative(s) selected, coverage=57% (threshold=25%)

##  [>] 121 distinct sequence(s)

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

##  [>] number of objects (sum of weights): 104

##  [>] max. distance: 74

##  [>] neighborhood radius: 7.4

##  [>] 3 representative(s) selected, coverage=30% (threshold=25%)

##  [>] 99 distinct sequence(s)

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

##  [>] number of objects (sum of weights): 296

##  [>] max. distance: 74

##  [>] neighborhood radius: 7.4

##  [>] 1 representative(s) selected, coverage=48% (threshold=25%)

##  [>] 204 distinct sequence(s)

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

##  [>] number of objects (sum of weights): 51

##  [>] max. distance: 74

##  [>] neighborhood radius: 7.4

##  [>] 6 representative(s) selected, coverage=27% (threshold=25%)

##  [>] 51 distinct sequence(s)

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique

# Transversal entropy of the state distribution at each position, per cluster.
seqHtplot(seqdata = seq, group = seq.part, xtlab = 14:50)

library(gtsummary)
# Cross-tabulation of generation by cluster with row percentages, followed
# by a test of association (add_p()).
donnees %>%
  select(all_of(c("generation", "classe"))) %>%
  tbl_summary(by = classe, percent = "row") %>%
  add_p()

## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.

## Characteristic | Classe 1, N = 366¹ | Classe 2, N = 183¹ | Classe 3, N = 104¹ | Classe 4, N = 296¹ | Classe 5, N = 51¹ | p-value²
## generation     |                    |                    |                    |                    |                   | 0.018
##   1930-1938    | 121 (36%)          | 67 (20%)           | 22 (6.5%)          | 108 (32%)          | 22 (6.5%)         |
##   1939-1945    | 96 (33%)           | 54 (18%)           | 41 (14%)           | 86 (29%)           | 18 (6.1%)         |
##   1946-1950    | 149 (41%)          | 62 (17%)           | 41 (11%)           | 102 (28%)          | 11 (3.0%)         |
## ¹ n (%)
## ² Pearson's Chi-squared test

# File: analyse_sequences.R
#
# Author: josep
#
# Rendered: 2021-05-06