# Copyright (C) 2023 Brodie Gaslam
#
# This file is part of "vetr - Trust, but Verify"
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.

# unitizer tests for vetr's internal string helpers.
#
# NOTE(review): `lorem`, `lorem.phrases`, `lorem.tr.phrases`,
# `lorem.ru.phrases` and `lorem.cn.phrases` are not defined in the visible part
# of this file; presumably they are supplied by the test harness - confirm.
# The relative 'unitizer/helper/...' paths used below are resolved against the
# working directory at run time.

library(vetr)

# Length helpers, string length / copy, two-argument formatting, first-letter
# case helpers, bulleting and collapsing.

unitizer_sect("Basic Tests", {
  vetr:::len_chr_len(1000L)
  vetr:::len_chr_len(1L)
  vetr:::len_chr_len(1234567890L)
  len0 <- 1234567890000000000000000
  vetr:::len_chr_len(len0)

  vetr:::len_as_chr(1000L)
  vetr:::len_as_chr(1L)
  vetr:::len_as_chr(1234567890L)
  vetr:::len_as_chr(len0)

  identical(vetr:::strmlen(lorem), nchar(lorem))
  vetr:::strmlen(lorem, 100L)

  identical(vetr:::strmcpy(lorem), lorem)
  vetr:::strmcpy("")
  vetr:::strmcpy(lorem, 20L)
  identical(nchar(vetr:::strmcpy(lorem, 20L)), 20L)

  vetr:::smprintf2("%s %s", lorem, lorem, 10L)
  vetr:::smprintf2("%s %s hello world there", lorem, lorem, 10L)
  lorem # make sure lorem unchanged

  vetr:::ucfirst("hello WORLD")
  vetr:::lcfirst("HELLO world")

  vetr:::strbullet(
    c(
      "hello world\nhow are things today",
      "once upon a time\nlived a funny duck"
    )
  )
  vetr:::strbullet("hello\nblah\n", bullet=" - ", ctd=" ")
  vetr:::strbullet(1:10) # error

  vetr:::collapse(letters[1:5])
  vetr:::collapse(letters[1:5], sep="\n")
  vetr:::collapse(character())
})

# Numeric-to-character conversion, including values at and around 1e9 and the
# non-finite corner cases.

unitizer_sect("numbers as character", {
  vetr:::num_as_chr(100)
  vetr:::num_as_chr(100.01)

  # switch to scientific

  num0 <- 1e9 + 0.1
  num1 <- -1e9 - 0.1

  # need to sub leading zeros before 'e' due to different display in
  # windows vs nix
  #
  sub("e[+-]?\\K0*", "", vetr:::num_as_chr(num0), perl=TRUE)
  sub("e[+-]?\\K0*", "", vetr:::num_as_chr(num1), perl=TRUE)

  vetr:::num_as_chr(num0, as.int=TRUE)
  vetr:::num_as_chr(num1, as.int=TRUE)

  num2 <- 1e9 - 0.1
  num3 <- -(1e9 - 0.1)

  vetr:::num_as_chr(num2)
  vetr:::num_as_chr(num3)

  # corner cases

  vetr:::num_as_chr(NA)
  vetr:::num_as_chr(NaN)
  vetr:::num_as_chr(Inf)
  vetr:::num_as_chr(-Inf)
})

# Six-argument formatting, with and without a trailing integer argument, plus
# a format string with fewer conversions than arguments.

unitizer_sect("smprintf6", {
  vetr:::smprintf6(
    "%s %s %s %s %s %s", "a", "bb", "ccc", "dddd", "eeeee", "ffffff"
  )
  vetr:::smprintf6(
    "%s %s %s %s %s %s", "a", "bb", "ccc", "dddd", "eeeee", "ffffff", 10L
  )
  vetr:::smprintf6(
    "%s %s %s %s %s %s", "a", "bb", "ccc", "dddd", "eeeee", "ffffff", 18L
  )
  # bad format strings

  vetr:::smprintf6(
    "%s %s %s %s", "a", "bb", "ccc", "dddd", "eeeee", "ffffff"
  )
  # bad format strings2 - this one reads memory it shouldn't

  # vetr:::smprintf6(
  #   "%s %s %s %s %s %s %s %s", "a", "bb", "ccc", "dddd", "eeeee", "ffffff"
  # )
})

unitizer_sect("Corner Cases", {
  # test maxlen overflows

  vetr:::strbullet(c("hello world"), maxlen=5L)  # early fail
  vetr:::strbullet(c("hello world"), maxlen=12L) # fails when adding bullets
  vetr:::strbullet(c("hello world"), maxlen=14L) # works

  # these are all supposed to fail

  vetr:::test_strmcpy()
  vetr:::test_strappend()
  vetr:::test_add_szt()
  vetr:::strmlen(list(), 100L)

  # quickly confirm the other smprintfs work correctly

  vetr:::test_smprintfx()
  vetr:::test_strappend2() # warning
})

unitizer_sect("substr", {
  vetr:::strsub(lorem.phrases, 25L, TRUE)
  vetr:::strsub(lorem.phrases, 25L, FALSE)

  # UTF8

  vetr:::strsub(lorem.tr.phrases, 25L, TRUE)
  vetr:::strsub(lorem.tr.phrases, 25L, FALSE)
  vetr:::strsub(lorem.ru.phrases, 25L, TRUE)
  vetr:::strsub(lorem.ru.phrases, 25L, FALSE)
  vetr:::strsub(lorem.cn.phrases, 25L, TRUE)
  vetr:::strsub(lorem.cn.phrases, 25L, FALSE)

  # Unfortunately something is going wrong with how out-of-BMP unicode is read
  # in by windows so we have to comment out these tests; see #82

  # vetr:::strsub(lorem.emo.phrases, 25L, TRUE)
  # vetr:::strsub(lorem.emo.phrases, 25L, FALSE)

  # Errors

  vetr:::strsub(lorem.phrases, 1:2, TRUE)
  vetr:::strsub(lorem.phrases, 25L, 1:2)
  vetr:::strsub(1:2, 25L, TRUE)
  vetr:::strsub(lorem.phrases, 2L, TRUE)
  vetr:::strsub(lorem.phrases, 3L, TRUE) # works
})

unitizer_sect("nchar_u", {
  vetr:::nchar_u(1:10)
  vetr:::nchar_u(c("a", "ab", "abc"))
})

unitizer_sect("char_offsets", {
  vetr:::char_offsets(1:10)
  vetr:::char_offsets(c("a", "ab", "abc"))
})

unitizer_sect("UTF8 corner cases, in UTF-8", {
  # Originally we tried using `Sys.setlocale` but that isn't guaranteed to work
  # e.g. failed on windows

  utf8.kuhn <- readLines('unitizer/helper/UTF-8-test.txt', encoding='UTF-8')
  test.start <- grep("^Here come the tests:", utf8.kuhn, useBytes=TRUE)
  test.start

  # Keep only the lines after the "Here come the tests:" marker

  utf8.test <- tail(utf8.kuhn, -test.start)
  # suppressWarnings(utf8.test) # Solaris problems

  nchar.base <- nchar(utf8.test, allowNA=TRUE)
  untranslatable <- is.na(nchar.base)
  nchar.vetr <- vetr:::nchar_u(utf8.test)

  # Not all lines match between nchar and vetr, differences seem to be that the
  # invalid encoding starting \xf4\x90 is resolved to 1 character by base (test
  # 2.3.5), and that base resolves things in the U+D800 - U+DFFF range as 1
  # character (5.1-5.2); perhaps base implements the more forgiving versions of
  # UTF8?

  base.vetr.diff <- !is.na(nchar.base) & nchar.vetr != nchar.base

  # # this actually causes problems with `str`, in particular with `strtrim` so
  # # we can't use it in the tests
  # sprintf(
  #   "%d %d %s",
  #   nchar.base[base.vetr.diff],
  #   nchar.vetr[base.vetr.diff],
  #   utf8.test[base.vetr.diff]
  # )

  # For visual verification, according to docs all test lines should render at
  # 79 characters, but this appears to be in contravention to the UTF8
  # documentation. This is the body of the e-mail I sent Dr. Kuhn inquiring
  # about the seeming discrepancy (which could possibly be explained by unicode
  # version differences):
  #
  # In section 3.3.3, 4-byte sequence with last byte missing, the sequence in
  # question appears to be f0 80 80 22 where 22 is the double quote ending the
  # sequence. Furthermore, you state that I should expect to "... see only a
  # single replacement character".
  #
  # What I'm a little confused by is that the sequence "f0 80 80" becomes
  # illegal after the first byte as per table 3.7 from the Unicode Standard
  # v10.0 since the second byte is less than 90. Then, by the definition of
  # "Maximal subpart of an ill-formed subsequence", it seems that the maximal
  # subpart starting at "f0" is actually "f0" since "f0 80" is not legal and
  # thus not a subpart. There is a close example in the documentation (p129 of
  # the document in question):
  #
  # Another example illustrates the application of the concept of maximal
  # subpart for UTF-8 continuation bytes outside the allowable ranges defined
  # in Table 3-7. The UTF-8 sequence <41 E0 9F 80 41> is ill-formed, because
  # <9F> is not an allowed second byte of a UTF-8 sequence commencing with
  # <E0>. In this case, there is an unconvertible offset at <E0> and the
  # maximal subpart at that offset is also <E0>. The subsequence <E0 9F>
  # cannot be a maximal subpart, because it is not an initial subsequence of
  # any well-formed UTF-8 code unit sequence.
  #
  # This is basically the same issue with "e0" substituted for "f0" in our case
  # (I think). This would suggest we should see three characters, one for the
  # maximal subpart "f0", and then also one each for the two "80" since they
  # themselves are illegal. I think that aligns with the "recommended" policy.
  # I'm pretty new at this so I suspect I'm just misunderstanding something. If
  # you see an obvious mistake in my reasoning I would appreciate you pointing
  # me to it.

  # # Same issue with strtrim
  # paste(
  #   ifelse(nchar.vetr > 79, sprintf("<%d>", nchar.vetr), " "), utf8.test
  # )

  # Other examples from the Unicode 10.0 docs
  # (`unicode.10` is not defined in this file; presumably the sourced helper
  # creates it - confirm)

  source('unitizer/helper/UTF-8-unicode-10-ex.R', local=TRUE)

  vetr:::nchar_u(unicode.10[1])
  vetr:::nchar_u(unicode.10[2])
  vetr:::nchar_u(unicode.10[3])
  vetr:::nchar_u(unicode.10[4])

  # Confirm offsets are what they should be

  vetr:::char_offsets(unicode.10[4])

  # Check all the critical cases where we transition from legal to illegal
  # sequences
  # (`crit.1` .. `crit.4` are likewise presumably created by the sourced
  # helper - confirm)

  source('unitizer/helper/UTF-8-critical.R', local=TRUE)

  Map(vetr:::char_offsets, crit.1)
  Map(vetr:::char_offsets, crit.2)
  Map(vetr:::char_offsets, crit.3)
  Map(vetr:::char_offsets, crit.4)
})

unitizer_sect("UTF-8 corner cases - other encodings", {
  # `lat.1.1` / `lat.1.2` are presumably created by the sourced helper -
  # confirm

  source('unitizer/helper/latin-1.R', local=TRUE)

  lapply(lat.1.1, vetr:::char_offsets)
  lapply(lat.1.2, vetr:::char_offsets)

  vetr:::strsub(lat.1.1, 3L, mark=FALSE)
  vetr:::strsub(lat.1.2, 3L, mark=FALSE)
})