# ---- Dataset integrity tests ----
# Verify dimensions, column names, key uniqueness, and types
# for all seven built-in datasets.
#
# Conventions used in this file:
# * Every test loads the dataset(s) it needs into its own environment via
#   data(..., envir = environment()), so nothing leaks between tests.
# * Lower bounds (row counts, overlap shares) use expect_gt() so that a
#   failure reports the actual value instead of a bare "not TRUE".
# * Key uniqueness is checked with anyDuplicated(), which returns 0L when
#   there is no duplicate and otherwise the index of the first duplicate.

# -- legislators -------------------------------------------------------

test_that("legislators has expected structure", {
  data(legislators, envir = environment())

  expect_s3_class(legislators, "data.frame")
  expect_equal(ncol(legislators), 15)
  expect_gt(nrow(legislators), 900)

  expected_cols <- c(
    "member_id", "assembly", "name", "name_hanja", "name_eng",
    "party", "party_elected", "district", "district_type",
    "committees", "gender", "birth_date", "seniority",
    "n_bills", "n_bills_lead"
  )
  expect_named(legislators, expected_cols)
})

test_that("legislators key columns are valid", {
  data(legislators, envir = environment())

  # member_id + assembly should be unique
  key <- paste(legislators$member_id, legislators$assembly)
  expect_identical(anyDuplicated(key), 0L)

  # assembly is one of 20, 21, 22
  expect_true(all(legislators$assembly %in% c(20, 21, 22)))

  # gender is M or F
  expect_true(all(legislators$gender %in% c("M", "F")))

  # district_type is one of two values
  valid_district_types <- c("constituency", "proportional")
  expect_true(all(legislators$district_type %in% valid_district_types))
})

# -- bills -------------------------------------------------------------

test_that("bills has expected structure", {
  data(bills, envir = environment())

  expect_s3_class(bills, "data.frame")
  expect_equal(ncol(bills), 9)
  expect_gt(nrow(bills), 60000)

  expected_cols <- c(
    "bill_id", "bill_no", "assembly", "bill_name", "committee",
    "propose_date", "result", "proposer", "proposer_id"
  )
  expect_named(bills, expected_cols)
})

test_that("bills key columns are valid", {
  data(bills, envir = environment())

  # bill_id alone should be unique
  expect_identical(anyDuplicated(bills$bill_id), 0L)

  expect_true(all(bills$assembly %in% c(20, 21, 22)))
  expect_s3_class(bills$propose_date, "Date")
})

# -- wealth ------------------------------------------------------------

test_that("wealth has expected structure", {
  data(wealth, envir = environment())

  expect_s3_class(wealth, "data.frame")
  expect_equal(ncol(wealth), 14)
  expect_gt(nrow(wealth), 2900)

  expected_cols <- c(
    "member_id", "year", "name", "total_assets", "total_debt",
    "net_worth", "real_estate", "building", "land", "deposits",
    "stocks", "n_properties", "has_seoul_property",
    "has_gangnam_property"
  )
  expect_named(wealth, expected_cols)
})

test_that("wealth key columns are valid", {
  data(wealth, envir = environment())

  # member_id + year should be unique
  key <- paste(wealth$member_id, wealth$year)
  expect_identical(anyDuplicated(key), 0L)

  # No na.rm here: a missing year makes all() return NA and fails the test.
  expect_true(all(wealth$year >= 2015 & wealth$year <= 2025))

  expect_type(wealth$has_seoul_property, "logical")
  expect_type(wealth$has_gangnam_property, "logical")
})

# -- seminars ----------------------------------------------------------

test_that("seminars has expected structure", {
  data(seminars, envir = environment())

  expect_s3_class(seminars, "data.frame")
  expect_equal(ncol(seminars), 18)
  expect_gt(nrow(seminars), 5900)

  expected_cols <- c(
    "name", "member_id", "year", "assembly", "party", "camp",
    "seniority", "n_seminars", "n_cross_party", "cross_party_ratio",
    "avg_coalition_size", "is_governing", "is_female",
    "is_proportional", "is_seoul", "province", "total_terms",
    "n_bills_led"
  )
  expect_named(seminars, expected_cols)
})

test_that("seminars key columns are valid", {
  data(seminars, envir = environment())

  expect_true(all(seminars$assembly %in% 17:22))
  expect_type(seminars$is_governing, "logical")

  # cross_party_ratio is a share in [0, 1]; missing ratios are skipped.
  ratio <- seminars$cross_party_ratio
  expect_true(all(ratio >= 0 & ratio <= 1, na.rm = TRUE))
})

# -- speeches ----------------------------------------------------------

test_that("speeches has expected structure", {
  data(speeches, envir = environment())

  expect_s3_class(speeches, "data.frame")
  expect_equal(ncol(speeches), 9)
  expect_gt(nrow(speeches), 15000)

  expected_cols <- c(
    "assembly", "date", "committee", "speaker", "role",
    "speaker_name", "member_id", "speech_order", "speech"
  )
  expect_named(speeches, expected_cols)
})

test_that("speeches key columns are valid", {
  data(speeches, envir = environment())

  expect_true(all(speeches$assembly == 22))
  expect_s3_class(speeches$date, "Date")

  valid_roles <- c(
    "legislator", "chair", "minister", "vice_minister",
    "senior_bureaucrat", "agency_head", "witness", "expert_witness",
    "nominee", "minister_nominee", "testifier", "public_corp_head",
    "broadcasting", "committee_staff"
  )
  expect_true(all(speeches$role %in% valid_roles))
})

# -- votes -------------------------------------------------------------

test_that("votes has expected structure", {
  data(votes, envir = environment())

  expect_s3_class(votes, "data.frame")
  expect_equal(ncol(votes), 13)
  expect_gt(nrow(votes), 7900)

  expected_cols <- c(
    "bill_id", "bill_no", "bill_name", "assembly", "committee",
    "vote_date", "result", "bill_type", "total_members", "voted",
    "yes", "no", "abstain"
  )
  expect_named(votes, expected_cols)
})

test_that("votes key columns are valid", {
  data(votes, envir = environment())

  expect_true(all(votes$assembly %in% c(20, 21, 22)))
  expect_s3_class(votes$vote_date, "Date")

  # yes + no + abstain should not exceed voted by more than one; rows with
  # a missing count are skipped.
  # NOTE(review): the reason for the slack of 1 is not visible in this
  # file -- confirm against the data source before tightening it.
  tallied <- votes$yes + votes$no + votes$abstain
  expect_true(all(tallied <= votes$voted + 1, na.rm = TRUE))
})

# -- roll_calls --------------------------------------------------------

test_that("roll_calls has expected structure", {
  data(roll_calls, envir = environment())

  expect_s3_class(roll_calls, "data.frame")
  expect_equal(ncol(roll_calls), 8)
  expect_gt(nrow(roll_calls), 360000)

  expected_cols <- c(
    "bill_id", "assembly", "member_name", "member_id", "party",
    "district", "vote", "vote_date"
  )
  expect_named(roll_calls, expected_cols)
})

test_that("roll_calls key columns are valid", {
  data(roll_calls, envir = environment())

  expect_true(all(roll_calls$assembly == 22))
  expect_s3_class(roll_calls$vote_date, "Date")

  # member_id + bill_id should be unique
  key <- paste(roll_calls$member_id, roll_calls$bill_id)
  expect_identical(anyDuplicated(key), 0L)
})

# -- Cross-dataset join keys -------------------------------------------

test_that("join keys are compatible across datasets", {
  # Only the datasets actually compared below are loaded.
  data(legislators, envir = environment())
  data(wealth, envir = environment())
  data(roll_calls, envir = environment())

  # wealth member_ids should largely exist in legislators
  # (share of wealth rows whose member_id appears in legislators)
  overlap <- mean(wealth$member_id %in% legislators$member_id, na.rm = TRUE)
  expect_gt(overlap, 0.5)

  # roll_calls member_ids should overlap with legislators; roll_calls is
  # compared against the assembly-22 legislators only.
  rc_ids <- unique(roll_calls$member_id)
  leg_ids <- unique(legislators$member_id[legislators$assembly == 22])
  overlap_rc <- mean(rc_ids %in% leg_ids)
  expect_gt(overlap_rc, 0.8)
})