# Number of SciCheck listing pages to walk when collecting article links.
num_pages <- 60
#' Collect the links listed on one SciCheck index page
#'
#' Builds the URL of the requested listing page, checks it against
#' robots.txt, pauses for two seconds, downloads the page with `read_html()`
#' (falling back to `read_html_live()` if that fails) and returns every
#' distinct `href` found inside the page's `<main>` element.
#'
#' @param page Page number appended to the SciCheck listing URL.
#' @return A character vector of unique `href` values. `character(0)` (with
#'   a warning) when robots.txt does not allow the page or when both readers
#'   failed to download it.
get_links <- function(page) {
  search_url <- str_c("https://www.factcheck.org/scicheck/page/", page, "/")
  print(search_url)

  # Act on the robots.txt answer instead of computing it and discarding it.
  if (!isTRUE(robotstxt::paths_allowed(search_url))) {
    warning("robots.txt does not allow ", search_url, call. = FALSE)
    return(character(0))
  }

  # Pause between requests.
  Sys.sleep(2)

  search_html <- tryCatch(
    read_html(search_url),
    error = function(e) {
      print("Error with read_html, trying read_html_live")
      print(e)
      tryCatch(
        read_html_live(search_url),
        error = function(e) {
          print("Error with read_html_live")
          print(e)
          NA
        }
      )
    }
  )

  # Both readers failed: NA cannot be queried with html_elements(), so stop
  # here rather than aborting the caller's whole loop over pages.
  if (identical(search_html, NA)) {
    warning("Could not download ", search_url, call. = FALSE)
    return(character(0))
  }

  search_html |>
    html_elements("main") |>
    html_elements("a") |>
    html_attr("href") |>
    unique()
}
# Link to FactCheck.org SciCheck Articles: https://www.factcheck.org/scicheck/
# Link to FactCheck.org SciCheck Articles Data
# Get All Links from SciCheck ----
# Walk every listing page and gather the links found on each one.
# get_links() only de-duplicates within a page, so de-duplicate again across
# pages; otherwise links repeated on every listing page are stored many times.
all_links <- seq_len(num_pages) |>
  lapply(get_links) |>
  unlist() |>
  unique()

# Persist the link list so the article scrape can be run separately.
tibble(link = all_links) |>
  write_csv("./pages/projects/scicheck/data/factcheck_scicheck_links.csv")
# Get all article data ----
#' Scrape the metadata and body text of one FactCheck.org article
#'
#' Checks the URL against robots.txt, pauses for two seconds, downloads the
#' page with `read_html()` (falling back to `read_html_live()` if that
#' fails), and extracts header metadata, footer taxonomy lists and the
#' article text.
#'
#' @param url Address of the article page.
#' @return A tibble of character columns `title`, `author`, `date`, `url`,
#'   `categories`, `tags`, `issue`, `people`, `misconceptions`, `location`
#'   and `article`. Footer lists are comma-separated, or `NA` when the page
#'   has none. A zero-row tibble with the same columns (plus a warning) is
#'   returned when robots.txt does not allow the URL or the page could not
#'   be downloaded.
get_article_data <- function(url) {
  print(url)

  # Zero-row result with the regular columns, used for every early exit so
  # callers can always bind_rows() the output.
  no_data <- function() {
    tibble(
      title = character(),
      author = character(),
      date = character(),
      url = character(),
      categories = character(),
      tags = character(),
      issue = character(),
      people = character(),
      misconceptions = character(),
      location = character(),
      article = character()
    )
  }

  # Act on the robots.txt answer instead of computing it and discarding it.
  if (!isTRUE(robotstxt::paths_allowed(url))) {
    warning("robots.txt does not allow ", url, call. = FALSE)
    return(no_data())
  }

  # Pause between requests.
  Sys.sleep(2)

  html <- tryCatch(
    read_html(url),
    error = function(e) {
      print("Error with read_html, trying read_html_live")
      print(e)
      tryCatch(
        read_html_live(url),
        error = function(e) {
          print("Error with read_html_live")
          print(e)
          NA
        }
      )
    }
  )

  # Both readers failed: NA cannot be queried with html_elements().
  if (identical(html, NA)) {
    warning("Could not download ", url, call. = FALSE)
    return(no_data())
  }

  # Header metadata
  header <- html |> html_elements("header")
  title <- header |> html_elements("h1") |> html_text()
  subheader <- header |> html_elements("p")
  author <- subheader[1] |>
    html_elements("a") |>
    html_text() |>
    paste(collapse = ", ")
  date <- subheader[2] |> html_elements("time") |> html_text()

  # Footer metadata: each taxonomy is a list inside the article footer.
  footer <- html |> html_elements("article") |> html_elements("footer")

  # Join the <li> texts under `css` with ", "; NA when there are none.
  footer_terms <- function(css) {
    joined <- footer |>
      html_elements(css) |>
      html_elements("li") |>
      html_text2() |>
      paste(collapse = ", ")
    if (joined == "") NA_character_ else joined
  }

  # Article
  entry <- html |>
    html_elements("article") |>
    html_elements(".entry-content")
  article <- entry |> html_text2()

  # Remove Spanish translation option: the text of the first <em> in the
  # body is dropped when the article mentions "español".
  # NOTE(review): the check looks at the whole article, not at the <em>
  # itself, so an unrelated first <em> can be removed -- confirm intent.
  em <- entry |>
    html_elements("em") |>
    html_text2() |>
    first()
  # any() keeps the condition scalar when `article` is empty or has several
  # elements; nzchar() avoids an empty search pattern.
  if (!is.na(em) && nzchar(em) && any(str_detect(article, "español"))) {
    # fixed(): the <em> text is page content, not a regular expression.
    article <- str_remove(article, fixed(em))
  }

  tibble(
    title = as.character(title),
    author = as.character(author),
    date = as.character(date),
    url = as.character(url),
    categories = footer_terms(".categories"),
    tags = footer_terms(".post_tag"),
    issue = footer_terms(".issue"),
    people = footer_terms(".person"),
    misconceptions = footer_terms(".misconceptions"),
    location = footer_terms(".location"),
    article = as.character(article)
  )
}
# Full scrape ----
# Read the stored link list and scrape it in six batches, writing one CSV
# per batch so a failure part-way through does not lose earlier batches.
links <- read_csv("./pages/projects/scicheck/data/factcheck_scicheck_links.csv")
split_links <- split(
  links,
  cut(seq_len(nrow(links)), breaks = 6, labels = FALSE)
)

# Path of the CSV holding batch `i`.
chunk_path <- function(i) {
  str_c("./pages/projects/scicheck/data/factcheck_scicheck_", i, ".csv")
}

for (i in seq_along(split_links)) {
  scicheck <- bind_rows(lapply(split_links[[i]]$link, get_article_data))
  scicheck |> write_csv(chunk_path(i))
}

# Re-assemble the batches into one table with a parsed date column.
scicheck1 <- read_csv(chunk_path(1))
scicheck2 <- read_csv(chunk_path(2))
scicheck3 <- read_csv(chunk_path(3))
scicheck4 <- read_csv(chunk_path(4))
scicheck5 <- read_csv(chunk_path(5))
scicheck6 <- read_csv(chunk_path(6))

scicheck <- bind_rows(
  scicheck1, scicheck2, scicheck3, scicheck4, scicheck5, scicheck6
) |>
  mutate(date = as.Date(date, format = "%B %d, %Y"))
scicheck |> write_csv("./data/factcheck_scicheck.csv")

# Incremental update ----
# Compare the first listing page against the stored data and scrape only
# the links that are not stored yet.
scicheck <- read_csv("./data/factcheck_scicheck.csv")

added_links <- tibble(link = get_links(1)) |>
  filter(!link %in% scicheck$url)

if (nrow(added_links) > 0) {
  new_scicheck <- bind_rows(lapply(added_links$link, get_article_data)) |>
    mutate(date = as.Date(date, format = "%B %d, %Y"))
} else {
  # Nothing new: without this guard bind_rows() yields a tibble with no
  # columns and `date` in mutate() resolves to the base function, which
  # makes as.Date() fail.
  new_scicheck <- scicheck[0, ]
}

combined_scicheck <- scicheck |> bind_rows(new_scicheck)
combined_scicheck |>
  arrange(date) |>
  write_csv("./data/factcheck_scicheck.csv")