Homework 1 code.qmd

library(readxl)
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.4.2     ✔ purrr   1.0.1
✔ tibble  3.2.1     ✔ dplyr   1.1.2
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.2     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
# Load the raw spreadsheet and keep only the rows whose
# `Type of Parliamentarian` text contains the character string "MP"
parliamentarians <- read_excel("~/Downloads/Parliamentarians.xlsx") |>
  filter(str_detect(`Type of Parliamentarian`, pattern = "MP"))

# Add the start and end date of each parliamentarian's term, both read out of
# `Type of Parliamentarian` by character position.
# str_sub(x, 5, 14) returns the characters of x from position 5 to position 14,
# e.g. str_sub("abcdefghytr56", 3, 10) returns "cdefghyt".
# ymd() converts a date stored as a string into a Date; date_ended is still a
# plain string after this step.
parliamentarians <- parliamentarians |>
  mutate(
    date_started = ymd(
      str_sub(`Type of Parliamentarian`, start = 5, end = 14)
    ),
    date_ended = str_sub(`Type of Parliamentarian`, start = 18, end = 27)
  )

# Build clean party and province columns. The patterns below are regular
# expressions, which extract or replace text based on patterns; it's fine if
# they are not clear yet. If you are curious, take a look at this:
# https://jfjelstul.github.io/regular-expressions-tutorial/
parliamentarians <- parliamentarians |>
  mutate(
    # first run of characters that are not parentheses, outer whitespace removed
    party = `Political Affiliation` |>
      str_extract("[^\\()]+") |>
      trimws(),
    # first run of characters that contains no carriage return
    province = str_extract(`Province/Territory`, pattern = "[^\\\r]+")
  )

# Change date_ended from a string to date format. quiet = TRUE silences the
# message about entries that cannot be parsed; those entries become NA.
# If no end date, Parliamentarian is still in HoC so put today as end date.
# today() already returns a Date, so it does not need to go through ymd().
# If you are very curious, look up difference between ifelse() and if_else()
parliamentarians <- parliamentarians |>
  mutate(
    date_ended = ymd(date_ended, quiet = TRUE),
    date_ended = if_else(is.na(date_ended), today(), date_ended)
  )

# Parties to keep; small parties and independents are left out
names_keep <- c(
  "Conservative",
  "Conservative Party of Canada",
  "Liberal Party of Canada",
  "New Democratic Party",
  "Progressive Conservative Party"
)
# Reduce to the variables we need, keep only the parties listed above,
# then delete every row that has a missing value
parliamentarians <- parliamentarians |>
  select(Name, Gender, party, date_started, date_ended, province) |>
  filter(party %in% names_keep) |>
  na.omit()

##### END OF CLEANING

# Figure 1: bar chart of the number of rows for each value of Gender
parliamentarians |>
  ggplot(mapping = aes(x = Gender)) +
  geom_bar()

# Save Figure 1
#ggsave("~/Desktop/figure1.png")

# Figure 2: the same bars as a plain bar chart of Gender, but drawn from
# counts we compute ourselves. count() groups and tallies in one step and
# returns one row per Gender with the tally stored in `count`.
to_plot <- parliamentarians |>
  count(Gender, name = "count")

# geom_col() draws bars whose heights are the values already in the data
ggplot(to_plot, aes(x = Gender, y = count)) +
  geom_col()

#ggsave("~/Desktop/figure2.png",width=8,height=6)

# Figure 3: Gender bar chart with one panel per period in which the term began.
# cut() bins the start year into the right-closed intervals (1860,1900],
# (1900,1950], (1950,2000] and (2000,2025]; a year outside the breaks gets NA.
# dig.lab = 4 makes the interval labels show all four digits of each year.
parliamentarians <- parliamentarians |>
  mutate(year = year(date_started)) |>
  mutate(
    year_period = cut(
      year,
      breaks = c(1860, 1900, 1950, 2000, 2025),
      dig.lab = 4
    )
  )

# The y axis is fixed to the range 0 to 1300
parliamentarians |>
  ggplot(aes(x = Gender)) +
  geom_bar() +
  facet_wrap(vars(year_period)) +
  scale_y_continuous(limits = c(0, 1300))

#ggsave("~/Desktop/figure3.png")