#' Byte length of a string once encoded as UTF-8
#'
#' Converts `str` to UTF-8 and adds up the sequence length announced by each
#' lead byte (1 for ASCII, 2 to 6 for multi-byte lead bytes). Continuation
#' bytes (0x80-0xBF) and the bytes 0xFE/0xFF contribute nothing, so for
#' well-formed UTF-8 the total equals the number of encoded bytes.
#'
#' @param str A single character string.
#' @param from Encoding of `str` as understood by `iconv()`. The default `""`
#'   means the encoding of the current locale.
#' @return A numeric scalar; `NA` when `str` is `NA` or cannot be converted.
utf8_len <- function(str, from = "") {
  encoded <- iconv(str, from = from, to = "UTF-8")
  # iconv() signals failure with NA, and charToRaw(NA_character_) would hand
  # back the two bytes of the text "NA" -- report NA instead of counting those.
  if (length(encoded) > 0L && is.na(encoded[[1L]])) {
    return(NA_real_)
  }

  bytes <- as.integer(charToRaw(encoded))

  # Lower bound of each byte class, paired with the width that class adds:
  # ASCII, continuation, 2-, 3-, 4-, 5-, 6-byte lead, invalid (0xFE/0xFF).
  lower_bounds <- c(0L, 128L, 192L, 224L, 240L, 248L, 252L, 254L)
  widths <- c(1, 0, 2, 3, 4, 5, 6, 0)

  sum(widths[findInterval(bytes, lower_bounds)])
}
#' Number of bytes in the UTF-8 encoding of a string
#'
#' @param str A single character string.
#' @param from Encoding of `str` as understood by `iconv()`. The default `""`
#'   means the encoding of the current locale.
#' @return An integer scalar; `NA` when `str` is `NA` or cannot be converted.
utf8_len2 <- function(str, from = "") {
  encoded <- iconv(str, from = from, to = "UTF-8")
  # A failed conversion comes back as NA; charToRaw() would turn that into the
  # bytes of the text "NA" and the function would wrongly report 2.
  if (length(encoded) > 0L && is.na(encoded[[1L]])) {
    return(NA_integer_)
  }
  length(charToRaw(encoded))
}
#' Extract a byte range from the UTF-8 encoding of a string
#'
#' `start` and `stop` are 1-based, inclusive *byte* offsets into the UTF-8
#' encoding of `str`, not character positions. The selected bytes are passed
#' through `iconv()` again (UTF-8 to UTF-8), so a range that cuts through a
#' multi-byte character is not valid UTF-8 and will typically come back as
#' `NA`.
#'
#' @param str A single character string.
#' @param start First byte to keep; values below 1 are treated as 1.
#' @param stop Last byte to keep; any value below 1 (including the default
#'   `0`) means "up to the last byte". Values past the end are clamped.
#' @param from Encoding of `str` as understood by `iconv()`. The default `""`
#'   means the encoding of the current locale.
#' @return A character scalar; `NA` when `str` is `NA` or cannot be converted.
utf8_substr <- function(str, start = 1, stop = 0, from = "") {
  encoded <- iconv(str, from = from, to = "UTF-8")
  # A failed conversion is NA; without this guard the text "NA" gets sliced.
  if (length(encoded) > 0L && is.na(encoded[[1L]])) {
    return(NA_character_)
  }

  bytes <- charToRaw(encoded)
  n_bytes <- length(bytes)

  start <- max(1, start)
  # Only stop < 1 selects "to the end", so stop = 1 yields the first byte.
  if (stop < 1) {
    stop <- n_bytes
  }
  # A start beyond stop collapses the range to the single byte at stop,
  # which keeps start:stop from counting backwards.
  if (start > stop) {
    start <- stop
  }

  # Never index past the last byte; a range entirely beyond the end is empty.
  last <- min(stop, n_bytes)
  selected <- if (start > last) raw(0) else bytes[start:last]

  iconv(rawToChar(selected), from = "UTF-8", to = "UTF-8")
}
# Demo: compare R's own character count with the UTF-8 helpers defined above.
# NOTE(review): every result below depends on how this literal's bytes are
# encoded in the running session (source file encoding / locale) -- confirm
# before treating the printed numbers as reference values.
str <- "b4dboy是个大帅哥"
# Length as reported by nchar() with its default settings.
print(nchar(str))
# UTF-8 length, converting from the current locale's encoding.
print(utf8_len(str))
# UTF-8 byte count, telling iconv() to read the input bytes as GBK.
# NOTE(review): presumably written for a GBK-locale session -- confirm; if the
# literal is actually UTF-8 the conversion will not mean what is intended.
print(utf8_len2(str, from='GBK'))
# start/stop are byte offsets into the UTF-8 encoding: byte 7 to the end.
print(utf8_substr(str, start = 7))
# Bytes 1 through 9 of the UTF-8 encoding.
print(utf8_substr(str, start = 1, stop = 9))