# Classification ascendante hiérarchique (CAH)

# Charger les données

library(questionr)
data(hdv2003)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3.9000     v purrr   0.3.4     
## v tibble  3.1.0          v dplyr   1.0.5     
## v tidyr   1.1.3          v stringr 1.4.0     
## v readr   1.4.0          v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gtsummary)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Recodage de hdv2003$age en hdv2003$groupe_age
hdv2003$groupe_age <- cut(hdv2003$age,
  include.lowest = TRUE,
  right = FALSE,
  dig.lab = 4,
  breaks = c(18, 25, 45, 65, 97)
)
## Recodage de hdv2003$nivetud en hdv2003$etud
hdv2003$etud <- fct_recode(hdv2003$nivetud,
  "Primaire" = "N'a jamais fait d'etudes",
  "Primaire" = "A arrete ses etudes, avant la derniere annee d'etudes primaires",
  "Primaire" = "Derniere annee d'etudes primaires",
  "Secondaire" = "1er cycle",
  "Secondaire" = "2eme cycle",
  "Technique/Pro" = "Enseignement technique ou professionnel court",
  "Technique/Pro" = "Enseignement technique ou professionnel long",
  "Supérieur" = "Enseignement superieur y compris technique superieur"
)
hdv2003$etud <- fct_explicit_na(hdv2003$etud, "Non documenté")

d2 <- hdv2003 %>%
  select(groupe_age, sexe, etud, peche.chasse, cinema, cuisine, bricol, sport, lecture.bd)

library(ade4)
acm <- dudi.acm(d2, scannf = FALSE, nf = Inf)

# calcul de la matrice de distance
md <- dist.dudi(acm)

# calcul de la matrice des distances de Gower
library(cluster)
md_gower <- daisy(d2, metric = "gower")

# calcul du dendrogramme
arbre <- hclust(md, method = "ward.D2")
arbre_gower <- hclust(md_gower, method = "ward.D2")

# Représenter le dendrogramme
plot(arbre, labels = FALSE)
rect.hclust(arbre, 2, border = "red")
rect.hclust(arbre, 5, border = "blue")

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.14.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
color_branches(arbre, k = 5) %>% ggplot(labels = FALSE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_dend(arbre, k = 5, show_labels = FALSE, rect = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# saut d'inertie
inertie <- sort(arbre$height, decreasing = TRUE)
plot(inertie[1:20], type = "s")

inertie_gower <- sort(arbre_gower$height, decreasing = TRUE)
plot(inertie_gower[1:10], type = "s")

source(url("https://raw.githubusercontent.com/larmarange/JLutils/master/R/clustering.R"))
best.cutree(arbre_gower, graph = TRUE)

## [1] 3
best.cutree(arbre, graph = TRUE)

## [1] 3
library(WeightedCluster)
## Loading required package: TraMineR
## 
## TraMineR stable version 2.2-1 (Built: 2020-11-02)
## Website: http://traminer.unige.ch
## Please type 'citation("TraMineR")' for citation information.
## This is WeightedCluster stable version 1.4-1 (Built: 2020-07-07)
## 
## To get the manuals, please run:
##    vignette("WeightedCluster") ## Complete manual in English
##    vignette("WeightedClusterFR") ## Complete manual in French
##    vignette("WeightedClusterPreview") ## Short preview in English
## 
## To cite WeightedCluster in publications please use:
## Studer, Matthias (2013). WeightedCluster Library Manual: A practical
##    guide to creating typologies of trajectories in the social sciences
##    with R. LIVES Working Papers, 24. doi:
##    10.12682/lives.2296-1658.2013.24
as.clustrange(arbre, md) %>% plot()

as.clustrange(arbre_gower, md_gower) %>% plot()

hdv2003$typo <- cutree(arbre, 3)
hdv2003$typo_gower <- cutree(arbre_gower, 3)

hdv2003 %>%
  tbl_summary(include = c(names(d2), "typo"), by = "typo")
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Characteristic 1, N = 1,2561 2, N = 1861 3, N = 5581
groupe_age
[18,25) 5 (0.4%) 164 (88%) 0 (0%)
[25,45) 662 (53%) 19 (10%) 25 (4.5%)
[45,65) 557 (44%) 3 (1.6%) 185 (33%)
[65,97] 32 (2.5%) 0 (0%) 348 (62%)
sexe
Homme 597 (48%) 84 (45%) 218 (39%)
Femme 659 (52%) 102 (55%) 340 (61%)
etud
Primaire 56 (4.5%) 1 (0.5%) 409 (73%)
Secondaire 307 (24%) 15 (8.1%) 65 (12%)
Technique/Pro 498 (40%) 48 (26%) 48 (8.6%)
Supérieur 391 (31%) 14 (7.5%) 36 (6.5%)
Non documenté 4 (0.3%) 108 (58%) 0 (0%)
peche.chasse
Non 1,048 (83%) 170 (91%) 558 (100%)
Oui 208 (17%) 16 (8.6%) 0 (0%)
cinema
Non 648 (52%) 41 (22%) 485 (87%)
Oui 608 (48%) 145 (78%) 73 (13%)
cuisine
Non 657 (52%) 100 (54%) 362 (65%)
Oui 599 (48%) 86 (46%) 196 (35%)
bricol
Non 627 (50%) 120 (65%) 400 (72%)
Oui 629 (50%) 66 (35%) 158 (28%)
sport
Non 731 (58%) 64 (34%) 482 (86%)
Oui 525 (42%) 122 (66%) 76 (14%)
lecture.bd
Non 1,209 (96%) 186 (100%) 558 (100%)
Oui 47 (3.7%) 0 (0%) 0 (0%)

1 n (%)

hdv2003$typo <- factor(hdv2003$typo)
hdv2003$typo_gower <- factor(hdv2003$typo_gower)

ggtable(
  hdv2003,
  columnsX = c("typo", "typo_gower"),
  columnsY = names(d2),
  cells = "col.prop",
  fill = "std.resid",
  legend = 1
)
## Warning in chisq.test(xtabs(weight ~ y + x, data = data)): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(xtabs(weight ~ y + x, data = data)): Chi-squared
## approximation may be incorrect

fviz_mca_ind(acm, geom = "point", alpha.ind = .25, habillage = hdv2003$typo, addEllipses = TRUE)