context("remove XML tags and change umlauts style")

# Tests for removeXML(), removeHTML() and removeUmlauts() on character vectors
# and on named lists of character vectors.
#
# NOTE(review): several input literals below contain no tags or entities even
# though the expected values differ from the inputs (e.g. "abaabcaa" is
# expected to become "aba ab caa"). The markup may have been lost when this
# file was extracted -- TODO confirm against the upstream test file.
test_that("removeXML", {
  #Sys.setlocale('LC_ALL','C')

  # Turn one hexadecimal code point (e.g. "e4") into the matching character.
  # Used instead of eval(parse()) on a generated "\u...." literal.
  hex_to_utf8 <- function(code) {
    intToUtf8(strtoi(code, base = 16L))
  }

  ## plain character vectors
  text <- c("abaabcaa", "ab&dgv;abc", "äÄöÖüÜß", "aa", "aab", "bc")
  tmp <- c("e4", "c4", "f6", "d6", "fc", "dc", "df")
  tmp <- paste0(vapply(tmp, hex_to_utf8, character(1)), collapse = "")
  text2 <- c("abaabcaa", "ab&dgv;abc", tmp, "aa", "aab", "bc")

  expect_equal(
    removeXML(x = text2),
    c("aba ab caa", "ab&dgv;abc", "\UE4\UC4\UF6\UD6\UFC\UDC\UDF", "aa", "aab", "bc")
  )
  expect_equal(
    removeHTML(x = text, symbolList = 1, dec = FALSE, hex = FALSE, delete = FALSE),
    text2
  )

  ## umlaut transliteration
  umlauts <- c("aba ab caa", "ab&dgv;abc", "\UE4\UC4\UF6\UD6\UFC\UDC\UDF", "aa", "aab", "bc")
  exp1 <- c("aba ab caa", "ab&dgv;abc", "aeAeoeOeueUess", "aa", "aab", "bc")
  expect_equal(removeUmlauts(x = umlauts), exp1)

  ## one decoding mode (dec / hex / entity) switched on at a time
  x <- c("øøø")
  expect_equal(
    removeHTML(x = x, symbolList = 1, dec = TRUE, hex = FALSE, entity = FALSE, delete = FALSE),
    "ø\UF8ø"
  )
  expect_equal(
    removeHTML(x = x, symbolList = 1, dec = FALSE, hex = TRUE, entity = FALSE, delete = TRUE),
    "\UF8"
  )
  expect_equal(
    removeHTML(x = x, symbolList = 1, dec = FALSE, hex = FALSE, entity = TRUE, delete = TRUE),
    "\UF8"
  )

  x <- c("øøøøøø")
  expect_equal(
    removeHTML(x = x, symbolList = 1, dec = TRUE, hex = FALSE, entity = FALSE, delete = TRUE),
    "\UF8\UF8\UF8"
  )
  expect_equal(
    removeHTML(x = x, symbolList = 1, dec = FALSE, hex = TRUE, entity = FALSE, delete = TRUE),
    "\UF8\UF8\UF8"
  )

  ## every distinct code returned by ISO8859(), minus the first after sorting
  ISOtest <- sort(unique(as.vector(ISO8859())))[-1]
  # Named character vector (names are the hex codes); unname() is applied
  # wherever it is compared against removeHTML() output.
  ISOtest2 <- vapply(ISOtest, hex_to_utf8, character(1))

  ISOtestSymbols <- toupper(paste0(as.hexmode(c(32:64,91:96,123:126,160:191,215,247, 818, 8194:8222, 8254, 8291, 8364, 8417, 8470)))) # Vector of Symbols
  # Strip leading zeros from the hex strings before matching them against
  # the codes in ISOtest; the result is a logical mask over ISOtest.
  ISOtestSymbols <- ISOtest %in% gsub(pattern = "^0*", replacement = "", ISOtestSymbols)
  # Expected output when symbols = TRUE is not passed: symbol code points
  # are expected to come back as empty strings.
  ISOtest2Symbols <- ISOtest2
  ISOtest2Symbols[ISOtestSymbols] <- ""

  ISOtestDec <- paste0("&#", strtoi(ISOtest, base = 16L), ";")
  ISOtestHex <- paste0("&#x", ISOtest, ";")
  # Look the table up once: column 2 is matched against the hex codes and
  # column 1 supplies the value passed to removeHTML(); codes without a match
  # yield NA and are excluded from the entity comparison below.
  entities <- namedEntity()
  ISOtestEnt <- entities[match(ISOtest, entities[, 2]), 1]
  EntNA <- is.na(ISOtestEnt)

  expect_equal(
    removeHTML(x = ISOtestDec, symbolList = c(1:11, 13:16), dec = TRUE, hex = FALSE, entity = FALSE, symbols = TRUE),
    unname(ISOtest2)
  )
  expect_equal(
    removeHTML(x = ISOtestHex, symbolList = c(1:11, 13:16), dec = FALSE, hex = TRUE, entity = FALSE, symbols = TRUE),
    unname(ISOtest2)
  )
  expect_equal(
    removeHTML(x = ISOtestEnt, symbolList = c(1:11, 13:16), dec = FALSE, hex = FALSE, entity = TRUE, symbols = TRUE)[!EntNA],
    unname(ISOtest2)[!EntNA]
  )
  expect_equal(
    removeHTML(x = ISOtestDec, symbolList = c(1:11, 13:16), dec = TRUE, hex = FALSE, entity = FALSE),
    unname(ISOtest2Symbols)
  )

  ## lists
  # The NULL entry (ID2) is dropped by list() semantics on comparison targets:
  # the expected lists below contain only ID1 and ID3.
  testlist <- list(ID1 = c("a", "b"), ID2 = NULL, ID3 = c("", "ä \UE4"))
  expect_equal(removeXML(testlist), list(ID1 = c("a", "b"), ID3 = c("", "ä \UE4")))
  expect_equal(removeHTML(testlist), list(ID1 = c("a", "b"), ID3 = c("", "\UE4 \UE4")))
  expect_equal(removeUmlauts(testlist), list(ID1 = c("a", "b"), ID3 = c("", "ä ae")))
})

# Scratch notes, kept commented out: manual experiments with Encoding() and
# gsub() on strings containing umlauts.
## text <- "ÄÖ"
## Encoding(text)
## text <- gsub(pattern="Ä", replacement="\u00C4", x=text)
## Encoding(text)
## text <- gsub(pattern="Ö", replacement="\u00D6", x=text)
## text
## text=="\u00C4\u00D6"
## Encoding(text) <- "UTF-8"
## text
## text=="\u00C4\u00D6"
## text <- gsub(pattern="Ä", replacement="\u00C4", x=text, useBytes=TRUE)
## text <- gsub(pattern="Ö", replacement="\u00D6", x=text, useBytes=TRUE)