# NOTE(review): no library() calls are visible in this part of the file. The
# code below assumes readxl, dplyr, stringr, lubridate and ggplot2 are already
# attached -- confirm against the top of the script.

# Load the raw data ----
parliamentarians <- read_excel("~/Downloads/Parliamentarians.xlsx")

# Keep only the rows where the character string "MP" appears in the variable
# `Type of Parliamentarian`.
parliamentarians <- parliamentarians |>
  filter(str_detect(`Type of Parliamentarian`, "MP"))

# Create (mutate) variables for the start and end date of the parliamentarian.
# str_sub(x, 5, 14) returns the characters between positions 5 and 14,
# e.g. str_sub("abcdefghytr56", 3, 10) returns "cdefghyt".
# ymd() coerces dates-as-strings to Date format. date_ended is kept as a
# string here and parsed in a later step.
parliamentarians <- parliamentarians |>
  mutate(
    date_started = ymd(str_sub(`Type of Parliamentarian`, 5, 14)),
    date_ended = str_sub(`Type of Parliamentarian`, 18, 27)
  )

# Clean and extract party and province. These are regular expressions and it
# is fine if they are not clear: regular expressions extract/replace data
# based on patterns. If you are curious, look up a regular-expression
# reference.
#   party:    first run of characters that are neither "(" nor ")", trimmed
#   province: first run of characters up to a carriage return
parliamentarians <- parliamentarians |>
  mutate(
    party = trimws(str_extract(`Political Affiliation`, "[^\\()]+")),
    province = str_extract(`Province/Territory`, "[^\\\r]+")
  )

# Change the end date from string to Date format. quiet = TRUE suppresses the
# warning for strings that fail to parse; those become NA.
parliamentarians <- parliamentarians |>
  mutate(date_ended = ymd(date_ended, quiet = TRUE))

# If there is no end date, the Parliamentarian is still in the HoC, so put
# today as the end date.
# If you are very curious, look up the difference between ifelse() and
# if_else().
parliamentarians <- parliamentarians |>
  mutate(date_ended = if_else(is.na(date_ended), today(), date_ended))

# Keep only these parties; let's not keep the small parties / independents.
names_keep <- c(
  "Conservative",
  "Conservative Party of Canada",
  "Liberal Party of Canada",
  "New Democratic Party",
  "Progressive Conservative Party"
)

# Keep only the variables we need and delete rows with missing values.
parliamentarians <- parliamentarians |>
  filter(party %in% names_keep) |>
  select(Name, Gender, party, date_started, date_ended, province) |>
  na.omit()

##### END OF CLEANING

# Make Figure 1 ----
ggplot(parliamentarians, aes(x = Gender)) +
  geom_bar()
# Save Figure 1 (left commented out; uncomment to write the file)
# ggsave("~/Desktop/figure1.png")

# Same bar chart built from pre-computed counts: one row per Gender with the
# number of parliamentarians in `count`.
to_plot <- parliamentarians |>
  group_by(Gender) |>
  summarise(count = n())

# stat = "identity" makes geom_bar() use the `count` column as the bar height
# instead of counting rows itself.
ggplot(to_plot, aes(x = Gender, y = count)) +
  geom_bar(stat = "identity")