# install_github("pachadotdev/tintin")
library(dplyr)
library(tidyr)
library(readxl)
library(countrycode)
library(ggplot2)
library(stringr)
library(tintin)
Tidying the Freedom Index
R and Shiny Training: If you find this blog to be interesting, please note that I offer personalized and group-based training sessions that may be reserved through Buy me a Coffee. Additionally, I provide training services in the Spanish language and am available to discuss means by which I may contribute to your Shiny project.
Motivation
I needed to create a few plots by using the Civil Liberties and Political Rights scores from the Freedom Index. However, the data provided in Excel format was not in an easy to use presentation. This blog post shows how I reshaped the data to make it easier to work with.
There is an excellent post, Cleaning Freedom House indicators, by Marta Kolczynska that I used as reference. The post is four years old, so here I am using some updated functions and I tried to make some steps more general.
Download and read the data
Required packages:
Download the data:
<- "https://freedomhouse.org/sites/default/files/2023-02/Country_and_Territory_Ratings_and_Statuses_FIW_1973-2023%20.xlsx"
url <- gsub("%20", "", gsub(".*/", "", url))
raw_xlsx
if (!file.exists(raw_xlsx)) {
download.file(url, raw_xlsx)
}
After opening the Excel file with Libre Office I realized I need to read the data from the second sheet and recode “-” to missing:
<- read_excel(raw_xlsx, sheet = 2, na = "-")
freedom_house freedom_house
# A tibble: 207 × 151
`Survey Edition` `Jan.-Feb. 1973` ...3 ...4 `Jan.-Feb. 1974` ...6 ...7
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Year(s) Under Revi… 1972 <NA> <NA> 1973 <NA> <NA>
2 <NA> PR CL Stat… PR CL Stat…
3 Afghanistan 4 5 PF 7 6 NF
4 Albania 7 7 NF 7 7 NF
5 Algeria 6 6 NF 6 6 NF
6 Andorra 4 3 PF 4 4 PF
7 Angola <NA> <NA> <NA> <NA> <NA> <NA>
8 Antigua and Barbuda <NA> <NA> <NA> <NA> <NA> <NA>
9 Argentina 6 3 PF 2 2 F
10 Armenia <NA> <NA> <NA> <NA> <NA> <NA>
# ℹ 197 more rows
# ℹ 144 more variables: `Jan.-Feb. 1975` <chr>, ...9 <chr>, ...10 <chr>,
# `Jan.-Feb. 1976` <chr>, ...12 <chr>, ...13 <chr>, `Jan.-Feb. 1977` <chr>,
# ...15 <chr>, ...16 <chr>, `1978` <chr>, ...18 <chr>, ...19 <chr>,
# `1979` <chr>, ...21 <chr>, ...22 <chr>, `1980` <chr>, ...24 <chr>,
# ...25 <chr>, `1981` <chr>, ...27 <chr>, ...28 <chr>, `1982` <chr>,
# ...30 <chr>, ...31 <chr>, `1983-84` <chr>, ...33 <chr>, ...34 <chr>, …
Tidy the data
Fix the first column name:
names(freedom_house)[1] <- "country"
freedom_house
# A tibble: 207 × 151
country `Jan.-Feb. 1973` ...3 ...4 `Jan.-Feb. 1974` ...6 ...7
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Year(s) Under Revi… 1972 <NA> <NA> 1973 <NA> <NA>
2 <NA> PR CL Stat… PR CL Stat…
3 Afghanistan 4 5 PF 7 6 NF
4 Albania 7 7 NF 7 7 NF
5 Algeria 6 6 NF 6 6 NF
6 Andorra 4 3 PF 4 4 PF
7 Angola <NA> <NA> <NA> <NA> <NA> <NA>
8 Antigua and Barbuda <NA> <NA> <NA> <NA> <NA> <NA>
9 Argentina 6 3 PF 2 2 F
10 Armenia <NA> <NA> <NA> <NA> <NA> <NA>
# ℹ 197 more rows
# ℹ 144 more variables: `Jan.-Feb. 1975` <chr>, ...9 <chr>, ...10 <chr>,
# `Jan.-Feb. 1976` <chr>, ...12 <chr>, ...13 <chr>, `Jan.-Feb. 1977` <chr>,
# ...15 <chr>, ...16 <chr>, `1978` <chr>, ...18 <chr>, ...19 <chr>,
# `1979` <chr>, ...21 <chr>, ...22 <chr>, `1980` <chr>, ...24 <chr>,
# ...25 <chr>, `1981` <chr>, ...27 <chr>, ...28 <chr>, `1982` <chr>,
# ...30 <chr>, ...31 <chr>, `1983-84` <chr>, ...33 <chr>, ...34 <chr>, …
Remove the first and second row of data:
<- freedom_house %>%
freedom_house filter(
!= "Year(s) Under Review",
country !is.na(country)
)
freedom_house
# A tibble: 205 × 151
country `Jan.-Feb. 1973` ...3 ...4 `Jan.-Feb. 1974` ...6 ...7
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Afghanistan 4 5 PF 7 6 NF
2 Albania 7 7 NF 7 7 NF
3 Algeria 6 6 NF 6 6 NF
4 Andorra 4 3 PF 4 4 PF
5 Angola <NA> <NA> <NA> <NA> <NA> <NA>
6 Antigua and Barbuda <NA> <NA> <NA> <NA> <NA> <NA>
7 Argentina 6 3 PF 2 2 F
8 Armenia <NA> <NA> <NA> <NA> <NA> <NA>
9 Australia 1 1 F 1 1 F
10 Austria 1 1 F 1 1 F
# ℹ 195 more rows
# ℹ 144 more variables: `Jan.-Feb. 1975` <chr>, ...9 <chr>, ...10 <chr>,
# `Jan.-Feb. 1976` <chr>, ...12 <chr>, ...13 <chr>, `Jan.-Feb. 1977` <chr>,
# ...15 <chr>, ...16 <chr>, `1978` <chr>, ...18 <chr>, ...19 <chr>,
# `1979` <chr>, ...21 <chr>, ...22 <chr>, `1980` <chr>, ...24 <chr>,
# ...25 <chr>, `1981` <chr>, ...27 <chr>, ...28 <chr>, `1982` <chr>,
# ...30 <chr>, ...31 <chr>, `1983-84` <chr>, ...33 <chr>, ...34 <chr>, …
Convert the dataset to long format:
<- freedom_house %>%
freedom_house pivot_longer(cols = -country, names_to = "year", values_to = "value")
freedom_house
# A tibble: 30,750 × 3
country year value
<chr> <chr> <chr>
1 Afghanistan Jan.-Feb. 1973 4
2 Afghanistan ...3 5
3 Afghanistan ...4 PF
4 Afghanistan Jan.-Feb. 1974 7
5 Afghanistan ...6 6
6 Afghanistan ...7 NF
7 Afghanistan Jan.-Feb. 1975 7
8 Afghanistan ...9 6
9 Afghanistan ...10 NF
10 Afghanistan Jan.-Feb. 1976 7
# ℹ 30,740 more rows
Fix the year by removing all non-numeric characters in the year column (i.e., the 2023 survey reflects the scores for 2022):
<- freedom_house %>%
freedom_house mutate(
year = as.integer(gsub("[^0-9]", "", year)) - 1L,
year = case_when(
< 1972 ~ NA_integer_,
year TRUE ~ year
)
)
freedom_house
# A tibble: 30,750 × 3
country year value
<chr> <int> <chr>
1 Afghanistan 1972 4
2 Afghanistan NA 5
3 Afghanistan NA PF
4 Afghanistan 1973 7
5 Afghanistan NA 6
6 Afghanistan NA NF
7 Afghanistan 1974 7
8 Afghanistan NA 6
9 Afghanistan NA NF
10 Afghanistan 1975 7
# ℹ 30,740 more rows
Replace blank years:
<- freedom_house %>%
freedom_house fill(year)
freedom_house
# A tibble: 30,750 × 3
country year value
<chr> <int> <chr>
1 Afghanistan 1972 4
2 Afghanistan 1972 5
3 Afghanistan 1972 PF
4 Afghanistan 1973 7
5 Afghanistan 1973 6
6 Afghanistan 1973 NF
7 Afghanistan 1974 7
8 Afghanistan 1974 6
9 Afghanistan 1974 NF
10 Afghanistan 1975 7
# ℹ 30,740 more rows
Add measurement categories:
<- freedom_house %>%
freedom_house group_by(country, year) %>%
mutate(
n = row_number(),
category = case_when(
== 1 ~ "political_rights",
n == 2 ~ "civil_liberties",
n == 3 ~ "status"
n
)%>%
) ungroup()
freedom_house
# A tibble: 30,750 × 5
country year value n category
<chr> <int> <chr> <int> <chr>
1 Afghanistan 1972 4 1 political_rights
2 Afghanistan 1972 5 2 civil_liberties
3 Afghanistan 1972 PF 3 status
4 Afghanistan 1973 7 1 political_rights
5 Afghanistan 1973 6 2 civil_liberties
6 Afghanistan 1973 NF 3 status
7 Afghanistan 1974 7 1 political_rights
8 Afghanistan 1974 6 2 civil_liberties
9 Afghanistan 1974 NF 3 status
10 Afghanistan 1975 7 1 political_rights
# ℹ 30,740 more rows
Convert the data to wide format because the value column is not tidy (i.e., it contains different units in the same column):
<- freedom_house %>%
freedom_house select(-n) %>%
pivot_wider(names_from = category, values_from = value)
freedom_house
# A tibble: 10,250 × 5
country year political_rights civil_liberties status
<chr> <int> <chr> <chr> <chr>
1 Afghanistan 1972 4 5 PF
2 Afghanistan 1973 7 6 NF
3 Afghanistan 1974 7 6 NF
4 Afghanistan 1975 7 6 NF
5 Afghanistan 1976 7 6 NF
6 Afghanistan 1977 6 6 NF
7 Afghanistan 1978 7 7 NF
8 Afghanistan 1979 7 7 NF
9 Afghanistan 1980 7 7 NF
10 Afghanistan 1981 7 7 NF
# ℹ 10,240 more rows
Convert political rights and civil liberties to integer (i.e., for South Africa in 1973 I considered the worst score, which is inside parenthesis, because the index was separated into White and Black populations):
<- function(x) {
remove_parenthesis <- str_extract(x, "\\((.*?)\\)")
y <- str_replace_all(y, "\\(|\\)", "")
y return(y)
}
<- freedom_house %>%
freedom_house mutate(
political_rights = case_when(
== "South Africa" & year == 1973 ~ remove_parenthesis(political_rights),
country TRUE ~ political_rights
),civil_liberties = case_when(
== "South Africa" & year == 1973 ~ remove_parenthesis(civil_liberties),
country TRUE ~ civil_liberties
),status = case_when(
== "South Africa" & year == 1973 ~ remove_parenthesis(status),
country TRUE ~ status
),
political_rights = as.integer(political_rights),
civil_liberties = as.integer(civil_liberties)
)
Warning: There were 2 warnings in `mutate()`.
The first warning was:
ℹ In argument: `political_rights = as.integer(political_rights)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
%>%
freedom_house filter(country == "South Africa", year == 1973)
# A tibble: 1 × 5
country year political_rights civil_liberties status
<chr> <int> <int> <int> <chr>
1 South Africa 1973 NA NA <NA>
freedom_house
# A tibble: 10,250 × 5
country year political_rights civil_liberties status
<chr> <int> <int> <int> <chr>
1 Afghanistan 1972 4 5 PF
2 Afghanistan 1973 7 6 NF
3 Afghanistan 1974 7 6 NF
4 Afghanistan 1975 7 6 NF
5 Afghanistan 1976 7 6 NF
6 Afghanistan 1977 6 6 NF
7 Afghanistan 1978 7 7 NF
8 Afghanistan 1979 7 7 NF
9 Afghanistan 1980 7 7 NF
10 Afghanistan 1981 7 7 NF
# ℹ 10,240 more rows
Recode the status to make it more readable:
<- freedom_house %>%
freedom_house mutate(
status = factor(
case_when(
== "F" ~ "Free",
status == "PF" ~ "Partially Free",
status == "NF" ~ "Not Free"
status
),levels = c("Free", "Partially Free", "Not Free"),
)
)
freedom_house
# A tibble: 10,250 × 5
country year political_rights civil_liberties status
<chr> <int> <int> <int> <fct>
1 Afghanistan 1972 4 5 Partially Free
2 Afghanistan 1973 7 6 Not Free
3 Afghanistan 1974 7 6 Not Free
4 Afghanistan 1975 7 6 Not Free
5 Afghanistan 1976 7 6 Not Free
6 Afghanistan 1977 6 6 Not Free
7 Afghanistan 1978 7 7 Not Free
8 Afghanistan 1979 7 7 Not Free
9 Afghanistan 1980 7 7 Not Free
10 Afghanistan 1981 7 7 Not Free
# ℹ 10,240 more rows
Do some verifications:
%>%
freedom_house filter(is.na(country))
# A tibble: 0 × 5
# ℹ 5 variables: country <chr>, year <int>, political_rights <int>,
# civil_liberties <int>, status <fct>
%>%
freedom_house filter(is.na(year))
# A tibble: 0 × 5
# ℹ 5 variables: country <chr>, year <int>, political_rights <int>,
# civil_liberties <int>, status <fct>
%>%
freedom_house filter(is.na(political_rights))
# A tibble: 1,207 × 5
country year political_rights civil_liberties status
<chr> <int> <int> <int> <fct>
1 Andorra 1977 NA NA <NA>
2 Andorra 1978 NA NA <NA>
3 Andorra 1979 NA NA <NA>
4 Andorra 1980 NA NA <NA>
5 Andorra 1981 NA NA <NA>
6 Andorra 198383 NA NA <NA>
7 Andorra 198484 NA NA <NA>
8 Andorra 198585 NA NA <NA>
9 Andorra 198686 NA NA <NA>
10 Andorra 198787 NA NA <NA>
# ℹ 1,197 more rows
Fix the year variable because some years were formatted as “198384” (i.e, 1983-84):
<- freedom_house %>%
freedom_house mutate(
year = case_when(
nchar(year) > 4 ~ as.integer(substr(year, 1, 4)),
TRUE ~ year
)
)
%>%
freedom_house filter(is.na(political_rights))
# A tibble: 1,207 × 5
country year political_rights civil_liberties status
<chr> <int> <int> <int> <fct>
1 Andorra 1977 NA NA <NA>
2 Andorra 1978 NA NA <NA>
3 Andorra 1979 NA NA <NA>
4 Andorra 1980 NA NA <NA>
5 Andorra 1981 NA NA <NA>
6 Andorra 1983 NA NA <NA>
7 Andorra 1984 NA NA <NA>
8 Andorra 1985 NA NA <NA>
9 Andorra 1986 NA NA <NA>
10 Andorra 1987 NA NA <NA>
# ℹ 1,197 more rows
Discard country-year combinations with missing values:
<- freedom_house %>%
freedom_house drop_na(political_rights, civil_liberties, status)
freedom_house
# A tibble: 9,043 × 5
country year political_rights civil_liberties status
<chr> <int> <int> <int> <fct>
1 Afghanistan 1972 4 5 Partially Free
2 Afghanistan 1973 7 6 Not Free
3 Afghanistan 1974 7 6 Not Free
4 Afghanistan 1975 7 6 Not Free
5 Afghanistan 1976 7 6 Not Free
6 Afghanistan 1977 6 6 Not Free
7 Afghanistan 1978 7 7 Not Free
8 Afghanistan 1979 7 7 Not Free
9 Afghanistan 1980 7 7 Not Free
10 Afghanistan 1981 7 7 Not Free
# ℹ 9,033 more rows
Additional steps
The data is now tidy, but adding ISO-2 and ISO-3 codes is highly convenient, so I create a table to join it with the data later:
<- freedom_house %>%
countries ungroup() %>%
distinct(country) %>%
mutate(
iso2c = countrycode(
country,origin = "country.name",
destination = "iso2c"
),iso3c = countrycode(
country,origin = "country.name",
destination = "iso3c"
) )
Warning: There were 2 warnings in `mutate()`.
The first warning was:
ℹ In argument: `iso2c = countrycode(country, origin = "country.name",
destination = "iso2c")`.
Caused by warning in `countrycode_convert()`:
! Some values were not matched unambiguously: Czechoslovakia, Kosovo, Micronesia, Serbia and Montenegro, Yugoslavia
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
countries
# A tibble: 205 × 3
country iso2c iso3c
<chr> <chr> <chr>
1 Afghanistan AF AFG
2 Albania AL ALB
3 Algeria DZ DZA
4 Andorra AD AND
5 Angola AO AGO
6 Antigua and Barbuda AG ATG
7 Argentina AR ARG
8 Armenia AM ARM
9 Australia AU AUS
10 Austria AT AUT
# ℹ 195 more rows
Add continent:
<- countries %>%
countries mutate(
continent = countrycode(
iso3c,origin = "iso3c",
destination = "continent"
)
)
countries
# A tibble: 205 × 4
country iso2c iso3c continent
<chr> <chr> <chr> <chr>
1 Afghanistan AF AFG Asia
2 Albania AL ALB Europe
3 Algeria DZ DZA Africa
4 Andorra AD AND Europe
5 Angola AO AGO Africa
6 Antigua and Barbuda AG ATG Americas
7 Argentina AR ARG Americas
8 Armenia AM ARM Asia
9 Australia AU AUS Oceania
10 Austria AT AUT Europe
# ℹ 195 more rows
Because some ISO codes could not be matched unambiguously, for some countries I need to add the continent manually:
%>%
countries filter(is.na(continent))
# A tibble: 5 × 4
country iso2c iso3c continent
<chr> <chr> <chr> <chr>
1 Czechoslovakia <NA> <NA> <NA>
2 Kosovo <NA> <NA> <NA>
3 Micronesia <NA> <NA> <NA>
4 Serbia and Montenegro <NA> <NA> <NA>
5 Yugoslavia <NA> <NA> <NA>
<- countries %>%
countries mutate(
continent = case_when(
== "Czechoslovakia" ~ "Europe",
country == "Kosovo" ~ "Europe",
country == "Micronesia" ~ "Oceania",
country == "Serbia and Montenegro" ~ "Europe",
country == "Yugoslavia" ~ "Europe",
country TRUE ~ continent
) )
Join the two data sets:
<- freedom_house %>%
freedom_house left_join(countries, by = "country") %>%
select(year, country, iso2c, iso3c, continent, political_rights, civil_liberties, status)
freedom_house
# A tibble: 9,043 × 8
year country iso2c iso3c continent political_rights civil_liberties status
<int> <chr> <chr> <chr> <chr> <int> <int> <fct>
1 1972 Afghanis… AF AFG Asia 4 5 Parti…
2 1973 Afghanis… AF AFG Asia 7 6 Not F…
3 1974 Afghanis… AF AFG Asia 7 6 Not F…
4 1975 Afghanis… AF AFG Asia 7 6 Not F…
5 1976 Afghanis… AF AFG Asia 7 6 Not F…
6 1977 Afghanis… AF AFG Asia 6 6 Not F…
7 1978 Afghanis… AF AFG Asia 7 7 Not F…
8 1979 Afghanis… AF AFG Asia 7 7 Not F…
9 1980 Afghanis… AF AFG Asia 7 7 Not F…
10 1981 Afghanis… AF AFG Asia 7 7 Not F…
# ℹ 9,033 more rows
Add colours for the status (i.e., here I avoided colours such as red and green because they are linked to positive and negative associations):
<- freedom_house %>%
freedom_house mutate(
color = case_when(
== "Free" ~ "#549f95",
status == "Partially Free" ~ "#a1aafc",
status == "Not Free" ~ "#7454a6"
status
) )
Final result
The final result is a tidy dataset that can be used to create plots and perform analysis:
freedom_house
# A tibble: 9,043 × 9
year country iso2c iso3c continent political_rights civil_liberties status
<int> <chr> <chr> <chr> <chr> <int> <int> <fct>
1 1972 Afghanis… AF AFG Asia 4 5 Parti…
2 1973 Afghanis… AF AFG Asia 7 6 Not F…
3 1974 Afghanis… AF AFG Asia 7 6 Not F…
4 1975 Afghanis… AF AFG Asia 7 6 Not F…
5 1976 Afghanis… AF AFG Asia 7 6 Not F…
6 1977 Afghanis… AF AFG Asia 6 6 Not F…
7 1978 Afghanis… AF AFG Asia 7 7 Not F…
8 1979 Afghanis… AF AFG Asia 7 7 Not F…
9 1980 Afghanis… AF AFG Asia 7 7 Not F…
10 1981 Afghanis… AF AFG Asia 7 7 Not F…
# ℹ 9,033 more rows
# ℹ 1 more variable: color <chr>
As an example, I can create a plot of the political rights and civil liberties scores in my country, Chile, where between 1973 and 1990 the country was under a dictatorship and then transitioned to democracy:
<- freedom_house %>%
dplot filter(country == "Chile", year >= 1973) %>%
mutate(
political_rights = abs(political_rights - 8),
civil_liberties = abs(civil_liberties - 8)
%>%
) pivot_longer(cols = c("political_rights", "civil_liberties"),
names_to = "category", values_to = "value") %>%
mutate(category = str_to_title(str_replace_all(category, "_", " ")))
ggplot(dplot) +
geom_line(aes(x = year, y = value, color = category),
linewidth = 1.2) +
facet_wrap(~category, nrow = 2) +
theme_minimal(base_size = 13) +
theme(legend.position = "none") +
labs(
title = "Political Rights and Civil Liberties in Chile",
subtitle = "Source: Freedom House Index.\nNote: Scores reversed for visual clarity (1 = least free, 7 = most free).",
x = "Year",
y = "Score"
+
) scale_colour_tintin_d(option = "tintin_in_the_land_of_the_soviets")