# Test UTF-8, separate to avoid issues with platforms that don't support it library(fansi) unitizer_sect("substr", { term.cap <- c('bright', '256', 'truecolor') lorem.cn.pieces <- substr(rep(lorem.cn, 5), c(1, 11, 21, 31), c(10, 15, 22, 45)) lorem.cn.col.1 <- paste0( red, lorem.cn.pieces[1], inv, lorem.cn.pieces[2], grn.bg, lorem.cn.pieces[3], rgb.und, lorem.cn.pieces[4], end ) lor.cn.c.1.5 <- rep(lorem.cn.col.1, 5) starts <- seq(1, 17, 4) ends <- starts + 3 substr2_ctl(lor.cn.c.1.5, starts, ends, term.cap=term.cap) # These are all six chars wide, but look different due to different width # characters lorem.cn.col.2 <- paste0( red, lorem.cn.pieces[1], "hello", inv, lorem.cn.pieces[2], " there ", grn.bg, lorem.cn.pieces[3], rgb.und, lorem.cn.pieces[4], end ) lor.cn.c.2.5 <- rep(lorem.cn.col.2, 5) starts <- seq(1, by=6, length.out=5) ends <- starts + 5 substr2_ctl(lor.cn.c.2.5, starts, ends, term.cap=term.cap) substr2_sgr(lor.cn.c.2.5, starts, ends, term.cap=term.cap) starts <- seq(1, by=12, length.out=5) ends <- starts + 11 substr2_ctl(lor.cn.c.2.5, starts, ends, type='width', term.cap=term.cap) # with colors that actually work on an OSX terminal lorem.cn.col.4 <- paste0( red, lorem.cn.pieces[1], "hello", inv, lorem.cn.pieces[2], " there ", grn.bg, lorem.cn.pieces[3], rgb.und.256, lorem.cn.pieces[4], end ) lor.cn.c.4.5 <- rep(lorem.cn.col.4, 5) substr2_ctl(lor.cn.c.4.5, starts, ends, type='width') # All wide characters even number of chars apart lorem.cn.col.3 <- paste0( red, lorem.cn.pieces[1], "helloo", inv, lorem.cn.pieces[2], " world! ", grn.bg, lorem.cn.pieces[3], rgb.und, lorem.cn.pieces[4], end ) lor.cn.c.3.5 <- rep(lorem.cn.col.3, 5) starts <- seq(1, by=12, length.out=5) ends <- starts + 10 ends[2] <- 24 # This is a bit of an accidental one, but it should be the case that the # second line has two extra single width characters because all the others are # losing the last character b/c we're ending in the middle, width wise substr2_ctl(lor.cn.c.3.5, starts, ends, type='width', term.cap=term.cap) # and now we grab those missing chars by allowing the round to happen substr2_ctl( lor.cn.c.3.5, starts, ends, type='width', round='both', term.cap=term.cap ) # jagged first one leads short, second long starts <- seq(1, by=7, length.out=5) ends <- starts + 8 substr2_ctl(lor.cn.c.1.5, starts, ends, type='width', term.cap=term.cap) substr2_ctl( lor.cn.c.1.5, starts, ends, type='width', round='stop', term.cap=term.cap ) # don't support byte encoded strings bytes <- "\xC0\xB1\xF0\xB1\xC0\xB1\xC0\xB1" Encoding(bytes) <- "bytes" # need trycatch due to instability from C level `error` call in getting the # function call tce(substr_ctl(bytes, 2, 3)) # Let's try a latin one string latin <- "H\xE9llo W\xD6rld!" Encoding(latin) <- "latin1" latin.utf8 <- substr_ctl(latin, 1, 9) latin.utf8 Encoding(latin.utf8) # Start/Stop rounding - examples rnd.1 <- "MnW" Encoding(rnd.1) <- "UTF-8" substr2_ctl(rnd.1, 2, 4, type='width', round='start') substr2_ctl(rnd.1, 2, 4, type='width', round='stop') substr2_ctl(rnd.1, 2, 4, type='width', round='neither') substr2_ctl(rnd.1, 2, 4, type='width', round='both') # Start/Stop rounding - end edge cases rnd.2 <- "MW" Encoding(rnd.2) <- "UTF-8" substr2_ctl(rnd.2, 2, 3, type='width', round='start') substr2_ctl(rnd.2, 2, 3, type='width', round='stop') substr2_ctl(rnd.2, 1, 2, type='width', round='start') substr2_ctl(rnd.2, 1, 2, type='width', round='stop') substr2_ctl(rnd.2, 3, 4, type='width', round='start') substr2_ctl(rnd.2, 3, 4, type='width', round='stop') }) unitizer_sect("rounding", { # handling of subsetting when we end up in middle of wide display characters substr2_ctl(lorem.cn.col.2, 1, 2, type='width') substr2_ctl(lorem.cn.col.2, 1, 3, type='width') substr2_ctl(lorem.cn.col.2, 2, 3, type='width') substr2_ctl(lorem.cn.col.2, 2, 4, type='width') substr2_ctl(lorem.cn.col.2, 3, 4, type='width') substr2_ctl(lorem.cn.col.2, 1, 2, type='width', round='stop') substr2_ctl(lorem.cn.col.2, 1, 3, type='width', round='stop') substr2_ctl(lorem.cn.col.2, 2, 3, type='width', round='stop') substr2_ctl(lorem.cn.col.2, 2, 4, type='width', round='stop') substr2_ctl(lorem.cn.col.2, 3, 4, type='width', round='stop') substr2_ctl(lorem.cn.col.2, 1, 2, type='width', round='both') substr2_ctl(lorem.cn.col.2, 1, 3, type='width', round='both') substr2_ctl(lorem.cn.col.2, 2, 3, type='width', round='both') substr2_ctl(lorem.cn.col.2, 2, 4, type='width', round='both') substr2_ctl(lorem.cn.col.2, 3, 4, type='width', round='both') substr2_ctl(lorem.cn.col.2, 1, 2, type='width', round='neither') substr2_ctl(lorem.cn.col.2, 1, 3, type='width', round='neither') substr2_ctl(lorem.cn.col.2, 2, 3, type='width', round='neither') substr2_ctl(lorem.cn.col.2, 2, 4, type='width', round='neither') substr2_ctl(lorem.cn.col.2, 3, 4, type='width', round='neither') substr2_ctl(lorem.cn.col.2, 2, 3, type='width', round='neither', terminate=FALSE) }) unitizer_sect("multi-elem", { # Due to preservation of state issues, need to make sure works well with # more than one value lor.cn.2.2 <- rep(lorem.cn.col.2, 2) substr2_ctl(lor.cn.2.2, c(1,3), c(2,4), type='width') substr2_ctl(lor.cn.2.2, c(2,4), c(2,4), type='width') }) unitizer_sect("zero width combining", { combo <- "hello\u0300\u035c world" Encoding(combo) <- "UTF-8" substr2_ctl(combo, 1, 5, type='width') substr2_ctl(combo, 5, 8, type='width') substr2_ctl(rep(combo, 2), c(1, 5), c(5, 8), type='width') nchar_ctl(combo, type='width') nchar_ctl(combo, type='graphemes') # zero width with double width combo3 <- paste0(substr(lorem.cn.pieces[1], 1, 2), '\u0300') Encoding(combo3) <- "UTF-8" substr2_ctl(combo3, 3, 4, type='width') substr2_ctl(combo3, 2, 4, type='width') substr2_ctl(combo3, 4, 4, type='width') substr2_ctl(combo3, 4, 5, type='width') # start with diacritic combo4 <- paste0('\u0300hello') substr2_ctl(combo4, 1, 1, type='width') # no diacritic substr2_ctl(combo4, 1, 1) # diacritic only substr2_ctl(combo4, 0, 1, type='width') # with diacritic substr2_ctl(combo4, 0, 0, type='width') # empty }) unitizer_sect("Corner cases", { utf8.bad <- "hello \xF0 world, goodnight moon" Encoding(utf8.bad) <- 'UTF-8' substr_ctl(utf8.bad, 1, 7) substr_ctl(utf8.bad, 5, 10) # Need to use `tryCatch` because the warnings vascillate for no rhyme or # reason between showing the call and not. Seems to be triggered by # re-installing package. now we're stuck with the try business to circumvent # that variability. tce(substr2_ctl(utf8.bad, 1, 7, type='width')) # # need to remove for changes in R3.6.0 # substr2_ctl(utf8.bad, 1, 7, type='width', warn=FALSE) tce(substr2_ctl(utf8.bad, 5, 10, type='width')) # # need to remove for changes in R3.6.0 # substr2_ctl(utf8.bad, 5, 10, type='width', warn=FALSE) # ends early chrs.2 <- "hello\xee" Encoding(chrs.2) <- "UTF-8" tce(substr2_ctl(chrs.2, 1, 10, type='width')) # # need to remove for changes in R3.6.0 # substr2_ctl(chrs.2, 1, 10, type='width', warn=FALSE) # bad utf8 in SGR and CSI bad.u <- c("A\033[31;\x80mB", "A\033[31;\x80pB") Encoding(bad.u) <- "UTF-8" substr_ctl(bad.u[1], 0, 3) substr_ctl(bad.u[2], 0, 3) # boundaries b.test <- c( "\uc0f6\ubed9", "\u0301a\ubed9", # leading diacritic "\ubed9\u0301a", # trailing diacritic "\ubed9a\u0301" # really trailing diacritic ) identical(substr_ctl(b.test, 0, 3), substr(b.test, 0, 3)) identical(substr_ctl(b.test, 0, 2), substr(b.test, 0, 2)) identical(substr_ctl(b.test, 1, 2), substr(b.test, 1, 2)) identical(substr_ctl(b.test, 0, 4), substr(b.test, 0, 4)) identical(substr_ctl(b.test, 4, 4), substr(b.test, 4, 4)) b.t.c <- sprintf("\033[43m%s\033[49m", b.test) substr_ctl(b.t.c, 0, 0) substr_ctl(b.t.c, 0, 2) substr_ctl(b.t.c, 1, 2) substr_ctl(b.t.c, 0, 4) substr_ctl(b.t.c, 4, 4) substr2_ctl(b.t.c, 0, 0, type='width') substr2_ctl(b.t.c, 0, 2, type='width') substr2_ctl(b.t.c, 1, 4, type='width') substr2_ctl(b.t.c, 0, 5, type='width') substr2_ctl(b.t.c, 5, 5, type='width') substr_ctl(b.t.c, 0, 4, terminate=FALSE) substr2_ctl(b.t.c, 1, 4, terminate=FALSE, type='width') }) unitizer_sect("nchar", { chr.dia <- 'A\u030A' nchar_ctl(chr.dia) nchar(chr.dia) # for reference, base gets it wrong too nchar_ctl(chr.dia, type='width') # Wide chars w1 <- "\u4E00\u4E01\u4E03" w2 <- "\u4E00\u4E01\u4E03" nchar_ctl(w1) nchar_ctl(w2, type='width') nchar_ctl(w2, type='graphemes') nchar_ctl(w2, type='bytes') # Allow NA for illegal sequences hello.illegal <- c("hello", "\xF0", "\xF0aaaa") Encoding(hello.illegal) <- 'UTF-8' nchar_ctl(hello.illegal) nchar_ctl(hello.illegal, allowNA=TRUE) # nzchar doesn't care about multi-byte illegal nzchar_ctl(hello.illegal) # escapes mixed in esc.1 <- sprintf( "hello \033[31mworld\033[m%s\033[48;5;123m blahs \033[m%s", "\u76F4\u8349", "\u56FA\u55F0\u5F8C" ) Encoding(esc.1) <- 'UTF-8' nchar_ctl(esc.1) nchar_ctl(esc.1, type='width') nchar_ctl(esc.1, type='bytes') nzchar_ctl(esc.1) esc.2 <- "\n\r\033P\033[31m\a" nchar_ctl(c(esc.1, esc.2, 'hello'), warn=FALSE) # _sgr esc.4 <- c(sprintf("\033[31m%shello", w1), NA, hello.illegal) nchar_sgr(esc.4, type='width', keepNA=FALSE, warn=FALSE, allowNA=TRUE) nzchar_sgr(esc.4, keepNA=FALSE, warn=FALSE) # _sgr does not strip C0; note R behavior on width of C0-C1 # fluctuating around R4.1 transition so can't test directly. nchar_sgr("\033[31m\thello", type='width') >= nchar_ctl("\033[31m\thello", type='width') # nchar doesn't care about bad bits embedded in escapes ncb <- c("123\033[31\x80m123", "123\033\x80123") Encoding(ncb) <- "UTF-8" nchar_ctl(ncb) }) unitizer_sect("unhandled", { # a bad utf8 string and other bad stuff utf8.bad.0 <- "hello\033\033\033[45p \xF0how wor\ald" Encoding(utf8.bad.0) <- "UTF-8" unhandled_ctl(utf8.bad.0) utf8.bad.1 <- "hello \xF0ho" Encoding(utf8.bad.1) <- "UTF-8" unhandled_ctl(utf8.bad.1) }) unitizer_sect("utf8clen", { # Can't test directly, but we can check what character lenght we get back from # nchar and infer whether things have changed or not # # These tests are designed to start failing if behavior of utf8clen changes # # See src/main/valid_utf8.h # U+0000..U+007F | 00..7F | # U+0080..U+07FF | C2..DF | 80..BF # U+0800..U+0FFF | E0 |*A0..BF*| 80..BF # U+1000..U+CFFF | E1..EC | 80..BF | 80..BF # U+D000..U+D7FF | ED |*80..9F*| 80..BF # U+E000..U+FFFF | EE..EF | 80..BF | 80..BF # U+10000..U+3FFFF | F0 |*90..BF*| 80..BF | 80..BF # U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF # U+100000..U+10FFFF | F4 |*80..8F*| 80..BF | 80..BF chrs <- c( "\xc2\x80", "\xDF\xBF", "\xe0\xA0\x80", "\xE0\xBF\xBF", "\xe1\x80\x80", "\xeC\xbf\xbf", "\xed\x80\x80", "\xed\x9f\xbf", "\xee\x90\x80", "\xef\xbf\xbf", "\xf0\x90\x80\x80", "\xf4\x8f\xbf\xbf", "\xf8\x80\x80\x80\x80", "\xfb\x80\x80\x80\x80", "\xfc\x80\x80\x80\x80\x80", "\xff\x80\x80\x80\x80\x80" ) Encoding(chrs) <- "UTF-8" nchar(chrs, allowNA=TRUE) nchar_ctl(chrs, allowNA=TRUE) # Of the 10xxxxxx variety utf8.bad.2 <- "\xBFaaaaaa" Encoding(utf8.bad.2) <- "UTF-8" nchar(utf8.bad.2, allowNA=TRUE) nchar_ctl(utf8.bad.2, allowNA=TRUE) ## remove for changes in R3.6.0 substr_ctl(utf8.bad.2, 1, 1) }) unitizer_sect("wrap corner cases", { # With UTF8 pre.2 <- "\x1b[32m\xd0\x9f \x1b[0m" ini.2 <- "\x1b[33m\xd1\x80 \x1b[0m" hello.8c <- "hello Привет world" Encoding(pre.2) <- "UTF-8" Encoding(ini.2) <- "UTF-8" Encoding(hello.8c) <- "UTF-8" pre.3 <- "\xd0\x9f " ini.3 <- "\xd1\x80 " Encoding(pre.3) <- "UTF-8" Encoding(ini.3) <- "UTF-8" wrap.csi.4 <- strwrap_ctl(hello.8c, 15, prefix=pre.2, initial=ini.2) wrap.csi.4 utf8.chr <- "\u76F4" strwrap2_ctl(utf8.chr, 1, wrap.always=TRUE) strwrap2_ctl(utf8.chr, 2, wrap.always=TRUE) strwrap2_ctl(utf8.chr, 3, wrap.always=TRUE) strwrap_ctl("lovelyday.", 10) strwrap2_ctl("lovelyday.", 10, wrap.always=TRUE) utf8.bad <- "hello \xF0 world, goodnight moon" Encoding(utf8.bad) <- "UTF-8" strwrap_ctl(utf8.bad, 10) # bad prefix values utf8.bad.2 <- "\xF0" Encoding(utf8.bad.2) <- "UTF-8" tcw(strwrap_ctl("hello world", 6, prefix=utf8.bad.2)) suppressWarnings(strwrap_ctl("hello world", 6, prefix=utf8.bad.2)) # Byte encoded strings not allowed bytes <- "\xC0\xB1\xF0\xB1\xC0\xB1\xC0\xB1" Encoding(bytes) <- "bytes" tce(strwrap_ctl(bytes)) # Encoding captured correctly encstrings <- c("hell\u00F8 world", "hello w\u00F8rld") Encoding(strwrap_ctl(encstrings, 5)) # Caused an infinite loop in one case str.inf <- "\U1F600 \U1F600" strwrap2_ctl(str.inf, 2) }) unitizer_sect("wrap with wide UTF8 and ESC", { wrap.mix <- strwrap_ctl(lorem.mix, 25) wrap.mix # identical( # strwrap(strip_ctl(lorem.mix, "sgr"), 25), strip_ctl(wrap.mix, "sgr") # ) string <- "\033[37;48;5;32m國官方認定的民族現有56個\033[39;49m" Encoding(string) <- "UTF-8" strwrap2_ctl(string, 24, wrap.always=TRUE, pad.end=" ") }) unitizer_sect("issue 54 ctd", { # other issu54 tests are in tohtml.R, but had to move this one here due to the # ellipsis utf-8 character. string3 <- c( "\033[38;5;246m# … with 5 more variables: total_time \033[3m\033[38;5;246m\033[38;5;246m\033[23m, result \033[3m\033[38;5;246m\033[38;5;246m\033[23m, memory \033[3m\033[38;5;246m\033[38;5;246m\033[23m,", "# time \033[3m\033[38;5;246m\033[38;5;246m\033[23m, gc \033[3m\033[38;5;246m\033[38;5;246m\033[23m\033[39m" ) Encoding(string3) <- "UTF-8" fansi::sgr_to_html(html_esc(string3)) # head <- "
"
  # f <- paste0(tempfile(), ".html")
  # writeLines(c(head, fansi::sgr_to_html(string3), "
"), f) # browseURL(f) # unlink(f) # trigger warnings/errors string4 <- c( "wow \033[31m then", "hello\033[\x80;wow", "yo \033[m there", "boom \033[41m" ) Encoding(string4) <- "UTF-8" sgr_to_html(string4) }) unitizer_sect("html_esc", { x <- "\U0001F600" html_esc(c("h&e'l\"lo", "wors", NA, ""), x) })