R语言判断UTF-8长度和切割中文字符串

使用R清洗数据时发现,UTF-8编码的中文在R里用nchar返回的长度为1(nchar默认按字符而不是按字节计数),str_length{stringr}返回为2;而一个中文字符在UTF-8里实际应该占3个或以上的字节,所以我在按字节切割字符串的时候,得到的结果往往不是我想要的。

网上搜了一圈也没找到解决办法,看来还是太小众了,为了解决这个问题我自己写了几个函数来进行处理,代码如下。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Length of a string in UTF-8 bytes, the "traditional" way: scan the encoded
# bytes and add up the sequence width announced by every lead byte.
#
# Arguments:
#   str  - a single character string.
#   from - source encoding handed to iconv(); "" (the default) means the
#          encoding of the current locale.
#
# Returns a numeric scalar: the sum of the widths declared by the lead bytes
# of the UTF-8 encoding of `str` (0 for an empty string).
utf8_len <- function(str, from = "") {
  bytes <- as.integer(charToRaw(iconv(str, from = from, to = "UTF-8")))

  # Lower bound of each byte class and the sequence width that class declares.
  # Continuation bytes (128-191) and the unused values 254-255 contribute 0
  # because their width is already accounted for by the preceding lead byte.
  class_start <- c(0L, 128L, 192L, 224L, 240L, 248L, 252L, 254L)
  class_width <- c(1, 0, 2, 3, 4, 5, 6, 0)

  sum(class_width[findInterval(bytes, class_start)])
}
# Length of a string in UTF-8 bytes, the simple way: convert the string to
# UTF-8 and count the raw bytes of the result.
#
# Arguments:
#   str  - a single character string.
#   from - source encoding handed to iconv(); '' (the default) means the
#          encoding of the current locale.
#
# Returns an integer scalar with the number of bytes.
utf8_len2 <- function(str, from = '') {
  utf8_text <- iconv(str, from = from, to = "UTF-8")
  utf8_bytes <- charToRaw(utf8_text)
  length(utf8_bytes)
}
# Extract a range of bytes from the UTF-8 encoding of a string.
#
# Arguments:
#   str   - a single character string to slice.
#   start - 1-based index of the first byte to keep. Values below 1 are
#           treated as 1; values past `stop` are pulled back to `stop`.
#   stop  - 1-based, inclusive index of the last byte to keep. Values below 1
#           (including the default 0) or beyond the end of the string mean
#           "up to the last byte".
#   from  - source encoding handed to iconv(); "" (the default) means the
#           encoding of the current locale.
#
# Returns the selected bytes as a UTF-8 string ("" for empty input).
# NOTE(review): offsets are bytes, not characters. If the range splits a
# multi-byte character the slice is not valid UTF-8; iconv() is documented to
# return NA for input it cannot convert, so expect NA in that case - confirm
# on the target platform.
utf8_substr <- function(str, start = 1, stop = 0, from = "") {
  bytes <- charToRaw(iconv(str, from = from, to = "UTF-8"))
  n_bytes <- length(bytes)

  # Only values below 1 act as the "to the end" sentinel, so stop = 1 can
  # select the first byte. Clamping to n_bytes keeps the index inside the raw
  # vector; indexing past it would pad with 00 bytes and break rawToChar().
  if (stop < 1 || stop > n_bytes) {
    stop <- n_bytes
  }
  start <- min(max(1, start), stop)

  # Round-trip through iconv() so the result is validated and marked as UTF-8.
  iconv(rawToChar(bytes[start:stop]), from = "UTF-8", to = "UTF-8")
}
# Demo: compare the character count with the UTF-8 byte count, then slice the
# string by byte offsets. The sample has 6 ASCII characters followed by 5
# Chinese characters that take 3 bytes each in UTF-8 (6 + 15 = 21 bytes).
str <- "b4dboy是个大帅哥"

# Character count (nchar() counts characters by default).
print(nchar(str))

# Byte count obtained by scanning the lead bytes.
print(utf8_len(str))

# Byte count obtained from the raw vector. The literal is in the session's
# own encoding, so the default `from` is used here instead of a hard-coded
# 'GBK', which would misread the bytes in any non-GBK session.
print(utf8_len2(str))

# Bytes 7..end: everything after the ASCII prefix.
print(utf8_substr(str, start = 7))

# Bytes 1..9: the ASCII prefix plus the first 3-byte character.
print(utf8_substr(str, start = 1, stop = 9))