gpt4 book ai didi

regex - 使用 Julia 在字符串中每 10 个字符插入一个换行符

转载 作者:行者123 更新时间:2023-12-03 23:25:49 26 4
gpt4 key购买 nike

我想在蛋白质序列中每 10 个字符插入一个换行符:

seq="MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"

在 Perl 中,这很容易:
$seq=~s/(.{10})/$1\n/g ; # does the job!

perl -e '$seq="MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"; $seq=~s/(.{10})/$1\n/g; print $seq'
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ

在 Julia 中,
replace(seq, r"(.{10})" , "\n")

不起作用,因为我不知道如何获取捕获组 (.{10}) 并将其替换为自身 + "\n"
julia> replace(seq, r"(.{10})" , "\n")
"\n\n\n\n\n\n"

所以要做到这一点,我需要 2 个步骤:
    julia> a=matchall(r"(.{1,10})" ,seq)
6-element Array{SubString{UTF8String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"
"RFFPYIALFQ"

julia> b=join(a, "\n")
"MSKNKSPLLN\nESEKMMSEML\nPMKVSQSKLN\nYEEKVYIPTT\nIRNRKQHCFR\nRFFPYIALFQ"

julia> println(b)
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ

# Caution :
a=matchall(r"(.{10})" ,seq) # wrong if seq is not exactly a multiple of 10 !

julia> seq
"MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIAL"

julia> matchall(r"(.{10})" ,seq)
5-element Array{SubString{UTF8String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"

julia> matchall(r"(.{1,10})" ,seq)
6-element Array{SubString{UTF8String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"
"RFFPYIAL"

是否有一步解决方案或更好(更快)的方法?

只是为了好玩,这里给出一个包含所有这些有趣答案的基准测试!(已使用 Julia 0.5 更新)
"""
    loop([io::IO=stdout,] a)

Write `a` to `io` with a line break after every 10 characters, followed by one final
newline.

The text is walked character by character, so a multi-byte character counts as one
character. No separator is emitted after the last chunk, and inputs shorter than 10
characters (including the empty string) are written unchanged.

# Arguments
- `io`: destination stream; defaults to `stdout`.
- `a`: the string to wrap.

# Returns
`nothing`.
"""
function loop(io::IO, a)
    # Chunk width, in characters.
    width = 10
    buf = IOBuffer()
    for (count, ch) in enumerate(a)
        # Break *before* the first character of every chunk except the first one.
        # Doing it this way never leaves a dangling separator after the last chunk
        # and never splits a short final chunk.
        if count > 1 && count % width == 1
            write(buf, '\n')
        end
        write(buf, ch)
    end
    println(io, String(take!(buf)))
    return nothing
end

loop(a) = loop(stdout, a)

"""
    regex1([io::IO=stdout,] seq)

Write `seq` to `io` split into lines of at most 10 characters, followed by one final
newline.

Chunks are found with a regular expression that matches between 1 and 10 characters,
so a shorter trailing chunk is kept rather than dropped.

# Arguments
- `io`: destination stream; defaults to `stdout`.
- `seq`: the string to wrap.

# Returns
`nothing`.
"""
function regex1(io::IO, seq)
    # `eachmatch` replaces `matchall`, which was removed in Julia 1.0; each
    # `RegexMatch` carries the matched text in its `match` field.
    chunks = (m.match for m in eachmatch(r"(.{1,10})", seq))
    println(io, join(chunks, "\n"))
    return nothing
end

regex1(seq) = regex1(stdout, seq)

"""
    regex2([io::IO=stdout,] seq)

Write `seq` to `io` split into lines of at most 10 characters, followed by one final
newline.

The approach is: append a space after every run of 10 characters, split the result on
whitespace, and join the pieces with newlines. Because the split is on whitespace, any
whitespace already present in `seq` is also treated as a chunk boundary, so this is
only suitable for inputs without whitespace (such as protein sequences).

# Arguments
- `io`: destination stream; defaults to `stdout`.
- `seq`: the string to wrap.

# Returns
`nothing`.
"""
function regex2(io::IO, seq)
    # The `pattern => replacement` pair form replaces the three-argument `replace`,
    # which was removed in Julia 1.0.
    spaced = replace(seq, r"(.{10})" => s"\1 ")
    # `split` without a delimiter cuts on whitespace and drops empty pieces, so the
    # trailing space left when the length is a multiple of 10 yields no blank line.
    println(io, join(split(spaced), "\n"))
    return nothing
end

regex2(seq) = regex2(stdout, seq)

"""
    regex3([io::IO=stdout,] seq)

Write `seq` to `io` split into lines of at most 10 characters, followed by one final
newline.

Every run of exactly 10 characters is replaced by itself plus a newline in a single
regex substitution; a shorter trailing run is left untouched.

# Arguments
- `io`: destination stream; defaults to `stdout`.
- `seq`: the string to wrap.

# Returns
`nothing`.
"""
function regex3(io::IO, seq)
    # The substitution string is built by hand so it can hold a real newline next to
    # the group reference. The `pattern => replacement` pair form replaces the
    # three-argument `replace`, which was removed in Julia 1.0.
    wrapped = replace(seq, r"(.{10})" => Base.SubstitutionString("\\1\n"))
    # When the length is a multiple of 10 the substitution leaves a newline at the
    # very end; strip it so `println` does not produce a blank line.
    println(io, chomp(wrapped))
    return nothing
end

regex3(seq) = regex3(stdout, seq)

"""
    intrapad([io::IO=stdout,] seq::AbstractString)

Write `seq` to `io` in chunks of at most 10 characters, each chunk followed by a
newline.

The chunks are assembled in an in-memory buffer and written to `io` in one call. An
empty `seq` writes nothing at all. Chunk boundaries are computed from string indices,
so `seq` is expected to contain only single-byte (ASCII) characters.

# Arguments
- `io`: destination stream; defaults to `stdout`.
- `seq`: the string to wrap.

# Returns
`nothing`.
"""
function intrapad(io::IO, seq::AbstractString)
    stop = lastindex(seq)
    # Exact output size: every character plus one newline per chunk.
    buf = IOBuffer(; sizehint=stop + cld(stop, 10))
    for i in 1:10:stop
        # Clamp the end index: the last chunk may be shorter than 10 characters and
        # `SubString` raises a `BoundsError` when asked to read past the end.
        write(buf, SubString(seq, i, min(i + 9, stop)), '\n')
    end
    # `String(take!(buf))` replaces `takebuf_string`, which was removed in Julia 1.0.
    print(io, String(take!(buf)))
    return nothing
end

intrapad(seq::AbstractString) = intrapad(stdout, seq)

"""
    join_substring([io::IO=stdout,] seq)

Write `seq` to `io` split into lines of at most 10 characters, followed by one final
newline.

Chunks are taken as `SubString` views and joined with newlines. Chunk boundaries are
computed from string indices, so `seq` is expected to contain only single-byte (ASCII)
characters.

# Arguments
- `io`: destination stream; defaults to `stdout`.
- `seq`: the string to wrap.

# Returns
`nothing`.
"""
function join_substring(io::IO, seq)
    stop = lastindex(seq)
    # Clamp the end index: the last chunk may be shorter than 10 characters and
    # `SubString` raises a `BoundsError` when asked to read past the end.
    chunks = (SubString(seq, i, min(i + 9, stop)) for i in 1:10:stop)
    println(io, join(chunks, '\n'))
    return nothing
end

join_substring(seq) = join_substring(stdout, seq)

# Sample protein sequence used as the benchmark input (60 characters).
seq = "MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"

# Run every implementation five times, printing its label and then its `@time` report.
# The first pass typically includes compilation time, so later passes are the ones to
# compare.
for pass in 1:5
    for (label, impl) in (
        ("loop :", loop),
        ("regex1 :", regex1),
        ("regex2 :", regex2),
        ("regex3 :", regex3),
        ("intrapad :", intrapad),
        ("join substring :", join_substring),
    )
        println(label)
        @time impl(seq)
    end
end

我将基准更改为 @time 执行 5 次,并在此处发布 @time 执行 5 次后的结果:
loop :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIA
LFQ
0.000013 seconds (53 allocations: 3.359 KB)
regex1 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000013 seconds (49 allocations: 1.344 KB)
regex2 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000017 seconds (47 allocations: 1.703 KB)
regex3 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000013 seconds (31 allocations: 976 bytes)
intrapad :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000007 seconds (9 allocations: 608 bytes)
join substring :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
0.000012 seconds (21 allocations: 800 bytes)

Intrapad 现在是第一个 ;)

最佳答案

就像 @daycaster 建议的那样,您可以使用 s"\1" 作为支持捕获组的替换字符串。麻烦的是,专用的 s"" 字符串语法不支持 \n 之类的特殊字符。您可以通过手动构建 SubstitutionString 对象来解决此问题,但这样就需要把 \1 转义写成 \\1:

julia> replace(seq, r"(.{10})", Base.SubstitutionString("\\1\n"))
"MSKNKSPLLN\nESEKMMSEML\nPMKVSQSKLN\nYEEKVYIPTT\nIRNRKQHCFR\nRFFPYIALFQ\n"

关于regex - 使用 Julia 在字符串中每 10 个字符插入一个换行符,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/40545980/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com