gpt4 book ai didi

r - R中基于交替字符的分割字符串

转载 作者:行者123 更新时间:2023-12-02 12:16:57 24 4
gpt4 key购买 nike

我正在尝试找出一种有效的方法来分割字符串,例如

"111110000011110000111000"

转化为向量

[1] "11111" "00000" "1111" "0000" "111" "000"

其中“0”和“1”可以是任意交替字符。

最佳答案

尝试

strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"

更新

使用stri_extract_all_regex修改@rawr的解决方案

library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"


stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111"
#[10] "000"

stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111"
#[8] "00000" "222" "aaa" "bb" "cc" "d" "11"
#[15] "D" "aa" "BB"

基准

library(stringi) 
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)

akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3,
perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3),
rle(strsplit(x3, "")[[1]])$lengths,
colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))

Cryo <- function(){
res_vector=rep(NA_character_,nchar(x3))
res_vector[1]=substr(x3,1,1)
counter=1
old_tmp=''

for (i in 2:nchar(x3)) {
tmp=substr(x3,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}

res_vector[!is.na(res_vector)]
}


richard <- function(){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

nicola<-function(x) {
indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
substring(x,indices[-length(indices)]+1,indices[-1])
}

richard2 <- function() {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

system.time(akrun())
# user system elapsed
# 0.003 0.000 0.003

system.time(thelate())
# user system elapsed
# 0.272 0.001 0.274

system.time(rawr())
# user system elapsed
# 0.397 0.001 0.398

system.time(ananda())
# user system elapsed
# 3.744 0.204 3.949

system.time(Colonel())
# user system elapsed
# 0.154 0.001 0.154

system.time(Cryo())
# user system elapsed
# 0.220 0.005 0.226

system.time(richard())
# user system elapsed
# 0.007 0.000 0.006

system.time(nicola(x3))
# user system elapsed
# 0.190 0.001 0.191

在稍大一点的绳子上,

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

system.time(akrun())
#user system elapsed
#0.166 0.000 0.155
system.time(richard())
# user system elapsed
# 0.606 0.000 0.569
system.time(richard2())
# user system elapsed
# 0.518 0.000 0.487

system.time(Colonel())
# user system elapsed
# 9.631 0.000 9.358


library(microbenchmark)
microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
#Unit: relative
# expr min lq mean median uq max neval cld
# richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581 20 b
#richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861 20 b
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a

注意:尝试运行其他方法,但需要很长时间

数据

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

关于r - R中基于交替字符的分割字符串,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/29383296/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com