# Whitespace trimming: each test parses a literal one-column CSV and checks
# the resulting `x` column.

test_that("ws dropped by default", {
  parsed <- read_csv(I("x\n a \n b\n"))
  expect_equal(parsed$x, c("a", "b"))
})

test_that("trim_ws = FALSE keeps ws", {
  parsed <- read_csv(I("x\n a\nb \n"), trim_ws = FALSE)
  expect_equal(parsed$x, c(" a", "b "))
})

test_that("trim_ws = TRUE trims spaces and tabs", {
  parsed <- read_csv(I("x\n a\n\tb \t\n"), trim_ws = TRUE)
  expect_equal(parsed$x, c("a", "b"))
})

# Encoding ----------------------------------------------------------------

test_that("locale encoding affects parsing", {
  # Round trip: re-encode the words as latin1, parse them with a latin1
  # locale, and check the result is byte-for-byte equal to the originals.
  # The commented-out expect_equal() calls are disabled expectations kept
  # for reference.
  utf8_words <- c("août", "élève", "ça va")
  # expect_equal(Encoding(utf8_words), rep("UTF-8", 3))

  latin1_words <- iconv(utf8_words, "UTF-8", "latin1")
  # expect_equal(Encoding(latin1_words), rep("latin1", 3))

  reparsed <- parse_character(
    latin1_words,
    locale = locale("fr", encoding = "latin1")
  )
  # expect_equal(Encoding(reparsed), rep("UTF-8", 3))

  # identical() coerces encodings to match, so compare the underlying bytes
  # of every element instead of the strings themselves.
  to_bytes <- function(strings) lapply(strings, charToRaw)
  expect_identical(to_bytes(utf8_words), to_bytes(reparsed))
})

test_that("Unicode Byte order marks are stripped from output", {
  # Hands `bytes` (numeric byte values) to read_lines() as a raw vector and
  # returns the raw bytes of whatever read_lines() produced.
  read_back <- function(bytes) {
    charToRaw(read_lines(as.raw(bytes)))
  }

  letter_a <- 0x41
  newline <- 0x0A
  only_a <- as.raw(letter_a)

  # UTF-8
  expect_equal(read_back(c(0xef, 0xbb, 0xbf, letter_a, newline)), only_a)

  # UTF-16 Big Endian
  expect_equal(read_back(c(0xfe, 0xff, letter_a, newline)), only_a)

  # UTF-16 Little Endian
  expect_equal(read_back(c(0xff, 0xfe, letter_a, newline)), only_a)

  # UTF-32 Big Endian
  expect_equal(
    read_back(c(0x00, 0x00, 0xfe, 0xff, letter_a, newline)),
    only_a
  )

  # UTF-32 Little Endian
  expect_equal(
    read_back(c(0xff, 0xfe, 0x00, 0x00, letter_a, newline)),
    only_a
  )

  # Vectors shorter than the BOM are handled safely: the bytes are expected
  # to come back unchanged.
  expect_equal(read_back(c(0xef, 0xbb)), as.raw(c(0xef, 0xbb)))
  expect_equal(read_back(0xfe), as.raw(0xfe))
  expect_equal(read_back(0xff), as.raw(0xff))
})