# Number of FactCheck Wire listing pages to crawl (pages 1..num_pages).
num_pages <- 412
#' Collect the unique link targets found on one FactCheck Wire listing page.
#'
#' Builds the listing URL for `page`, prints it, checks it against
#' robots.txt, pauses for two seconds, and then downloads the page with
#' `read_html()`, falling back to `read_html_live()` when the first attempt
#' raises an error.
#'
#' @param page Page number inserted into the FactCheck Wire archive URL.
#' @return A character vector of the unique `href` values of the `<a>`
#'   elements inside `<main>`. `character(0)` is returned when robots.txt
#'   disallows the page or when both download attempts fail.
get_links <- function(page) {
  search_url <- str_c(
    "https://www.factcheck.org/the-factcheck-wire/page/", page, "/"
  )
  print(search_url)

  # The robots.txt verdict was previously computed and thrown away.
  if (!isTRUE(robotstxt::paths_allowed(search_url))) {
    warning("robots.txt disallows ", search_url, "; skipping", call. = FALSE)
    return(character(0))
  }
  Sys.sleep(2)

  search_html <- tryCatch(
    read_html(search_url),
    error = function(e) {
      print("Error with read_html, trying read_html_live")
      print(e)
      tryCatch(
        read_html_live(search_url),
        error = function(e) {
          print("Error with read_html_live")
          print(e)
          NULL
        }
      )
    }
  )

  # Without this guard a failed download reached html_elements() and threw,
  # which aborted the entire crawl instead of skipping one page.
  if (is.null(search_html)) {
    return(character(0))
  }

  search_html |>
    html_elements("main") |>
    html_elements("a") |>
    html_attr("href") |>
    unique()
}
# NOT COMPLETE
# Link to FactCheck.org Articles
# Get All Links from FactCheck.org
# Crawl every listing page and flatten the per-page results into one vector.
# seq_len() is used so that a page count of zero yields no requests.
all_links <- lapply(seq_len(num_pages), get_links) |> unlist()
# error on page 14
tibble(link = all_links) |>
  write_csv("./pages/projects/factcheck/data/factcheck_links.csv")
# Get all article data
#' Scrape one FactCheck.org article into a tibble row.
#'
#' Prints `url`, checks it against robots.txt, pauses for two seconds, and
#' then downloads the page with `read_html()`, falling back to
#' `read_html_live()` when the first attempt raises an error.
#'
#' @param url Address of the article page.
#' @return A tibble with character columns `title`, `author`, `date`, `url`,
#'   `categories`, `tags`, `issue`, `people`, `misconceptions`, `location`
#'   and `article`. Footer fields with no entries are `NA`. When robots.txt
#'   disallows the page or both download attempts fail, a single row holding
#'   `url` with every other column `NA` is returned.
get_article_data <- function(url) {
  # Placeholder row, so one unreachable page does not abort a whole batch.
  unavailable_row <- function() {
    tibble(
      title = NA_character_,
      author = NA_character_,
      date = NA_character_,
      url = as.character(url),
      categories = NA_character_,
      tags = NA_character_,
      issue = NA_character_,
      people = NA_character_,
      misconceptions = NA_character_,
      location = NA_character_,
      article = NA_character_
    )
  }

  # A zero-length column would make tibble() silently return zero rows.
  or_na <- function(x) {
    if (length(x) == 0) NA_character_ else as.character(x)
  }

  print(url)

  # The robots.txt verdict was previously computed and thrown away.
  if (!isTRUE(robotstxt::paths_allowed(url))) {
    warning("robots.txt disallows ", url, "; skipping", call. = FALSE)
    return(unavailable_row())
  }
  Sys.sleep(2)

  html <- tryCatch(
    read_html(url),
    error = function(e) {
      print("Error with read_html, trying read_html_live")
      print(e)
      tryCatch(
        read_html_live(url),
        error = function(e) {
          print("Error with read_html_live")
          print(e)
          NULL
        }
      )
    }
  )
  if (is.null(html)) {
    return(unavailable_row())
  }

  # Header metadata
  header <- html |> html_elements("header")
  title <- header |> html_elements("h1") |> html_text()
  subheader <- header |> html_elements("p")
  author <- subheader[1] |>
    html_elements("a") |>
    html_text() |>
    paste(collapse = ", ")
  date <- subheader[2] |> html_elements("time") |> html_text()

  # Footer metadata: each taxonomy is a list of <li> items under its own
  # class; join the items with ", " and report an empty list as NA.
  footer <- html |> html_elements("article") |> html_elements("footer")
  footer_terms <- function(css) {
    terms <- footer |>
      html_elements(css) |>
      html_elements("li") |>
      html_text2() |>
      paste(collapse = ", ")
    if (terms == "") NA_character_ else terms
  }

  # Article
  content <- html |>
    html_elements("article") |>
    html_elements(".entry-content")
  article <- content |> html_text2()

  # Remove Spanish translation option. The test is made on the <em> text
  # itself so an unrelated emphasised phrase is never stripped, and the
  # text is matched literally because it may contain regex metacharacters.
  em <- content |>
    html_elements("em") |>
    html_text2() |>
    first()
  if (!is.na(em) && str_detect(em, "español")) {
    article <- str_remove(article, fixed(em))
  }

  tibble(
    title = or_na(title),
    author = as.character(author),
    date = or_na(date),
    url = as.character(url),
    categories = footer_terms(".categories"),
    tags = footer_terms(".post_tag"),
    issue = footer_terms(".issue"),
    people = footer_terms(".person"),
    misconceptions = footer_terms(".misconceptions"),
    location = footer_terms(".location"),
    article = or_na(article)
  )
}
links <- read_csv("./pages/projects/factcheck/data/factcheck_links.csv")
# Divide the link table into 41 consecutive batches and scrape them one at
# a time; each batch is written to its own numbered CSV as soon as it is done.
split_links <- split(
  links,
  cut(seq_len(nrow(links)), breaks = 41, labels = FALSE)
)
for (i in seq_along(split_links)) {
  factcheck <- bind_rows(lapply(split_links[[i]]$link, get_article_data))
  write_csv(
    factcheck,
    str_c("./pages/projects/factcheck/data/factcheck_", i, ".csv")
  )
}
# Combine the numbered per-batch CSVs into one table, parse the publication
# date, and save the result. The earlier version mutated an empty tibble
# (which has no `date` column) and then wrote an undefined `scicheck` object.
batch_files <- list.files(
  "./pages/projects/factcheck/data",
  pattern = "^factcheck_[0-9]+\\.csv$", # excludes factcheck_links.csv
  full.names = TRUE
)
if (length(batch_files) == 0) {
  stop("No factcheck_<n>.csv batch files found to combine", call. = FALSE)
}
factcheck <- batch_files |>
  # Read everything as text so column types agree across batches.
  lapply(read_csv, col_types = cols(.default = col_character())) |>
  bind_rows() |>
  mutate(date = as.Date(date, format = "%B %d, %Y"))
factcheck |> write_csv("./data/factcheck.csv")
factcheck <- read_csv("./data/factcheck.csv")
# Incremental update: scrape the links on the first listing page that are
# not already present in the saved data, append them, and re-save by date.
added_links <- tibble(link = get_links(1)) |>
  # html_attr() yields NA for anchors without an href; those cannot be scraped.
  filter(!is.na(link), !link %in% factcheck$url)
new_factcheck <- bind_rows(lapply(added_links$link, get_article_data))
# With nothing new to scrape the result has no `date` column to parse.
if ("date" %in% names(new_factcheck)) {
  new_factcheck <- new_factcheck |>
    mutate(date = as.Date(date, format = "%B %d, %Y"))
}
combined_factcheck <- factcheck |> bind_rows(new_factcheck)
combined_factcheck |> arrange(date) |> write_csv("./data/factcheck.csv")