h/t to Bob Rudis for sharing the data source, and to Roel Hogervorst for the guide to scraping this data. He provided the bulk of the scraping code, and I added a bit of additional data cleaning. The data this week comes from Privacy Affairs.
I have also included all the raw text (gdpr_text.tsv) of the actual GDPR legal documents, in case anyone is interested in parsing through them or using them alongside the violations.
Per Wikipedia, the GDPR is:

> The General Data Protection Regulation (EU) 2016/679 (GDPR) is a regulation in EU law on data protection and privacy in the European Union (EU) and the European Economic Area (EEA). It also addresses the transfer of personal data outside the EU and EEA areas. The GDPR aims primarily to give control to individuals over their personal data and to simplify the regulatory environment for international business by unifying the regulation within the EU. Superseding the Data Protection Directive 95/46/EC, the regulation contains provisions and requirements related to the processing of personal data of individuals (formally called data subjects in the GDPR) who reside in the EEA, and applies to any enterprise—regardless of its location and the data subjects' citizenship or residence—that is processing the personal information of data subjects inside the EEA.
# Get the Data
```r
gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
gdpr_text <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv')

# Or read in with the tidytuesdayR package (https://github.com/dslc-io/tidytuesdayR)
# PLEASE NOTE: to use the 2020 data you need a tidytuesdayR version released after January 2020.
# Either the ISO-8601 date or year/week works!
# Install via pak::pak("dslc-io/tidytuesdayR")

tuesdata <- tidytuesdayR::tt_load('2020-04-21')
tuesdata <- tidytuesdayR::tt_load(2020, week = 17)

gdpr_violations <- tuesdata$gdpr_violations
```
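For a quick first look (a sketch, not part of the official dataset prep): total fines by country. Note that `date` comes in as a character column; the `mdy()` call assumes the month/day/year strings used at the source.

```r
library(tidyverse)

# Total fines levied per country, largest first
# (lubridate::mdy() assumes month/day/year date strings)
gdpr_violations %>%
  mutate(date = lubridate::mdy(date)) %>%
  group_by(name) %>%
  summarise(total_fines = sum(price), n_fines = n()) %>%
  arrange(desc(total_fines))
```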
Data dictionary for `gdpr_violations.tsv`:

variable | class | description |
---|---|---|
id | integer | Identifier for fine/violation |
picture | character | SVG image of violation country flag |
name | character | Name of country where violation was enforced |
price | integer | Fine amount in euros (€) |
authority | character | Authority that enacted the violation |
date | character | Date of violation |
controller | character | Data controller, i.e., the violating organization |
article_violated | character | Specific GDPR Article violated (see the gdpr_text.tsv data for specifics) |
type | character | Type of violation |
source | character | Original source (URL) of fine data |
summary | character | Summary of violation |

Data dictionary for `gdpr_text.tsv`:

variable | class | description |
---|---|---|
chapter | double | GDPR Chapter Number |
chapter_title | character | Chapter title |
article | double | GDPR Article number |
article_title | character | Article title |
sub_article | double | Sub-article number |
gdpr_text | character | Raw text of article/subarticle |
href | character | URL to the raw text itself |
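Because `article_violated` names specific articles, the two files can be linked. A rough sketch (extracting the first number from each article string is an assumption about the column's formatting; inspect it first):

```r
library(tidyverse)

# Split multi-article violations apart, pull out the article number,
# and attach the matching article title from gdpr_text.
# str_extract() grabbing the first number is an assumption about how
# article_violated is formatted (e.g. "Art. 32 GDPR").
gdpr_violations %>%
  separate_rows(article_violated, sep = "\\|") %>%
  mutate(article = as.double(str_extract(article_violated, "[:digit:]+"))) %>%
  left_join(distinct(gdpr_text, article, article_title), by = "article")
```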
The full cleaning script used to prepare this week's datasets:

```r
library(tidyverse)
library(rvest)

# Note: the following code was adapted from
# https://blog.rmhogervorst.nl/blog/2020/04/08/scraping-gdpr-fines/
link <- "https://www.privacyaffairs.com/gdpr-fines/"
page <- read_html(link)
temp <- page %>%
  html_nodes("script") %>%
  .[9] %>%                      # the fines data lives in the 9th <script> tag as embedded JSON
  rvest::html_text()

# Locate every "[" and "]" so we can slice out the two embedded JSON arrays
ends <- str_locate_all(temp, "\\]")
starts <- str_locate_all(temp, "\\[")

# First JSON array -> first table of fines
table1 <- temp %>%
  stringr::str_sub(start = starts[[1]][1, 2], end = ends[[1]][1, 1]) %>%
  str_remove_all("\\\n") %>%
  str_remove_all("\\\r") %>%
  jsonlite::fromJSON() %>%
  as_tibble() %>%
  mutate(summary = str_remove_all(summary, "<p>|</p>|\n"))

# Second JSON array -> second table of fines
table2 <- temp %>%
  stringr::str_sub(start = starts[[1]][2, 2], end = ends[[1]][2, 1]) %>%
  str_remove_all("\\\n") %>%
  str_remove_all("\\\r") %>%
  jsonlite::fromJSON() %>%
  as_tibble() %>%
  mutate(summary = str_remove_all(summary, "<p>|</p>|\n"))
# Combine both tables and strip leftover markup
all_df <- bind_rows(table1, table2) %>%
  janitor::clean_names() %>%
  mutate(
    authority = str_remove(authority, "\t"),
    article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
      str_remove('</a>'),
    # use "|" as the separator so multiple articles can be split apart later
    article_violated = str_replace_all(article_violated, ", Art", "|Art"),
    type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
      str_remove('</a>')
  )
# Most frequently violated articles
all_df %>%
  separate_rows(article_violated, sep = "\\|") %>%
  count(article_violated, sort = TRUE)
all_df %>%
  write_tsv("2020/2020-04-21/gdpr_violations.tsv")
```
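The bracket arithmetic in the `str_sub()` calls above can be hard to read. Here is a minimal toy version of the same idea (the string and variable names are purely illustrative):

```r
library(stringr)

# A script tag's text with two embedded JSON arrays, like the real page
temp <- 'var data = [{"id":1},{"id":2}]; var more = [{"id":3}];'

starts <- str_locate_all(temp, "\\[")  # positions of every "["
ends <- str_locate_all(temp, "\\]")    # positions of every "]"

# Slice from the first "[" through the first "]" (inclusive),
# exactly as the script does for the real page
json1 <- str_sub(temp, start = starts[[1]][1, 2], end = ends[[1]][1, 1])
jsonlite::fromJSON(json1)  # a two-row data frame: id = 1, 2
```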
```r
# Getting the actual article text -----------------------------------------
raw_article <- "https://gdpr-info.eu/" %>%
  read_html()

# Get all the urls for specific articles/chapters
gdpr_href <- raw_article %>%
  html_node(xpath = '//*[@id="tablepress-12"]') %>%
  html_nodes("a") %>%
  html_attr("href")

# Pull the titles as well
gdpr_titles <- raw_article %>%
  html_node(xpath = '//*[@id="tablepress-12"]') %>%
  html_nodes("a") %>%
  html_attr("data-title")

# Pull the numbers of articles/chapters
gdpr_numbers <- raw_article %>%
  html_node(xpath = '//*[@id="tablepress-12"]') %>%
  html_nodes("a") %>%
  html_text()

# Put it all into a df
gdpr_df <- tibble(
  article = gdpr_numbers,
  title = str_trim(gdpr_titles),
  href = gdpr_href
)
# Tidy up the data, separating chapters from articles
clean_gdpr <- gdpr_df %>%
  # entries longer than 3 characters are chapter headers, not article numbers
  mutate(
    chapter = if_else(str_length(article) > 3, article, NA_character_),
    chapter_title = if_else(str_length(article) > 3, title, NA_character_)
  ) %>%
  # carry each chapter label down to the articles beneath it
  fill(chapter, chapter_title) %>%
  filter(!str_detect(article, "Chapter")) %>%
  mutate(article = as.double(article)) %>%
  filter(!is.na(article)) %>%
  select(starts_with("chapter"), article, article_title = title, href)

clean_gdpr
```
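As a quick sanity check (the expected counts come from the GDPR's published structure of 11 chapters and 99 articles, not from this script):

```r
# Rough structural checks on the scraped table of contents;
# the expected counts are based on the GDPR's published layout
clean_gdpr %>% distinct(chapter) %>% nrow()  # expect 11 chapters
clean_gdpr %>% distinct(article) %>% nrow()  # expect 99 articles
```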
```r
# LONG-running step:
# get all the raw html from each of the urls for each article
all_articles <- clean_gdpr %>%
  mutate(raw_html = map(href, read_html))
```
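If you re-run this fetch, a politer variant throttles the requests. A sketch using `purrr::slowly()` (the one-second delay is an arbitrary assumption; tune as needed):

```r
# Wait ~1 second between requests so we don't hammer gdpr-info.eu
# (the delay length is an assumption)
slow_read <- purrr::slowly(read_html, rate = purrr::rate_delay(1))

all_articles <- clean_gdpr %>%
  mutate(raw_html = map(href, slow_read))
```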
```r
# Function to take raw html and turn it into text for that specific article
get_gdpr_text <- function(html_in){

  # articles with numbered sub-articles are marked up as an ordered list
  test_var <- html_in %>%
    html_node(".entry-content") %>%
    html_nodes("ol") %>%
    html_text()

  if (length(test_var) == 0){
    # no ordered list: the article is a single paragraph
    text <- html_in %>%
      html_node(".entry-content > p") %>%
      html_text() %>%
      str_remove("^[:digit:]")
  } else {
    # otherwise split the first ordered list into one string per sub-article
    text <- html_in %>%
      html_node(".entry-content") %>%
      html_nodes("ol") %>%
      html_text() %>%
      .[[1]] %>%
      str_replace_all(";\n", "\t") %>%  # protect breaks inside a sub-article
      str_replace_all(":\n", "\t") %>%
      str_split("\n") %>%
      .[[1]] %>%
      .[. != ""] %>%                    # drop empty strings
      str_replace_all("\t", "\n") %>%   # restore the protected breaks
      str_remove("^[:digit:]")          # strip the leading sub-article number
  }

  text
}

# Test on a single article
get_gdpr_text(read_html("http://gdpr-info.eu/art-2-gdpr/"))

# Unnest the list column of text
clean_articles <- all_articles %>%
  mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>%
  unnest_longer(gdpr_text)

# Final dataframe
final_articles <- clean_articles %>%
  group_by(article) %>%
  mutate(sub_article = row_number()) %>%
  relocate(sub_article, .after = "article_title") %>%
  relocate(gdpr_text, .after = "sub_article") %>%
  ungroup() %>%
  mutate(chapter = str_extract(chapter, "[:digit:]+")) %>%
  mutate_at(vars(chapter, article, sub_article), as.double) %>%
  select(-raw_html)

final_articles %>% view()

write_tsv(final_articles, "2020/2020-04-21/gdpr_text.tsv")
```