gpt4 book ai didi

go - 使用Golang从文本文件末尾读取特定内容的有效方法

转载 作者:行者123 更新时间:2023-12-01 22:38:51 26 4
gpt4 key购买 nike

我有一个大的文本日志文件,其中包含两部分,并用一些特殊字符分隔,就像

...
this is the very large
part, contains lots lines.

#SPECIAL CHARS START#

...
this is the small part the the end,
contain several lines, but
we do not know how many lines
this part contains


我的要求是获得 #SPECIAL CHARS START#之后到最后的一小部分文本内容,如何使用Golang有效地获得它?

更新:
我当前的解决方案是从文件末尾逐行获取并记住光标,如果该行包含特殊字符,则中断循环并使用reareay获取光标
func getBackwardLine(file *os.File, start int64) (string, int64) {
line := ""
cursor :=start
stat, _ := file.Stat()
filesize := stat.Size()

for {
cursor--
file.Seek(cursor, io.SeekEnd)

char := make([]byte, 1)
file.Read(char)

if cursor != -1 && (char[0] == 10 || char[0] == 13) {
break
}

line = fmt.Sprintf("%s%s", string(char), line)

if cursor == -filesize {
break
}
}
return line, cursor

}

func main() {
file, err := os.Open("some.log")
if err != nil {
os.Exit(1)
}
defer file.Close()

var cursor int64 = 0
var line = ""

for {
line, cursor = getBackwardLine(file, cursor)
fmt.Println(line)
if(strings.Contains(line, "#SPECIAL CHARS START#")) {
break
}
}


fmt.Println(cursor) //now we get the cursor for the start of special characters
}

最佳答案

该解决方案实现了向后阅读器。

它从头开始按b.Len字节块读取文件,然后向前寻找一个分隔符,当前为该块内的\n,然后以SepIndex推进起始偏移量(这是为了防止将搜索字符串分成两个连续的字符串读)。在进行下一个块读取之前,它将在读取的块中查找search字符串,如果找到,它将返回其在文件中的起始位置并停止。
否则,它将通过b.Len减小起始偏移量,然后读取下一个块。

只要您的搜索字符串位于文件的最后40%,您就应该获得更好的性能,,但则需要进行实战测试。

如果您的搜索字符串在最后10%以内,我相信您会赢。

main.go

package main

import (
"bytes"
"flag"
"fmt"
"io"
"log"
"os"
"time"

"github.com/mattetti/filebuffer"
)

func main() {

var search string
var sep string
var verbose bool
flag.StringVar(&search, "search", "findme", "search word")
flag.StringVar(&sep, "sep", "\n", "separator for the search detection")
flag.BoolVar(&verbose, "v", false, "verbosity")
flag.Parse()

d := make(chan struct{})
b := &bytes.Buffer{}
go func() {
io.Copy(b, os.Stdin)
d <- struct{}{}
}()
<-time.After(time.Millisecond)
select {
case <-d:
default:
os.Stdin.Close()
}

readSize := 1024
if b.Len() < 1 {
input := fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), readSize-5), search)
input += input
b.WriteString(input)
}

bsearch := []byte(search)
s, err := bytesSearch(b.Bytes())
if err != nil {
log.Fatal(err)
}
if verbose {
s.logger = log.New(os.Stderr, "", log.LstdFlags)
}
s.Buffer = make([]byte, readSize)
s.Sep = []byte(sep)
got, err := s.Index(bsearch)
if err != nil {
log.Fatal(err)
}
fmt.Println("Index ", got)
got, err = s.Index2(bsearch)
if err != nil {
log.Fatal(err)
}
fmt.Println("Index ", got)

}

type tailSearch struct {
F io.ReadSeeker
Buffer []byte
Sep []byte
start int64
logger interface {
Println(...interface{})
}
}

func fileSearch(f *os.File) (ret tailSearch, err error) {
ret.F = f
st, err := f.Stat()
if err != nil {
return
}
ret.start = st.Size()
ret.Sep = []byte("\n")
return ret, nil
}

func bytesSearch(b []byte) (ret tailSearch, err error) {
ret.F = filebuffer.New(b)
ret.start = int64(len(b))
ret.Sep = []byte("\n")
return
}

func (b tailSearch) Index(search []byte) (int64, error) {

if b.Buffer == nil {
b.Buffer = make([]byte, 1024, 1024)
}
buf := b.Buffer
blen := len(b.Buffer)

hasended := false
for !hasended {
if b.logger != nil {
b.logger.Println("a start", b.start)
}
offset := b.start - int64(blen)
if offset < 0 {
offset = 0
hasended = true
}
_, err := b.F.Seek(offset, 0)
if err != nil {
hasended = true
}
n, err := b.F.Read(buf)
if b.logger != nil {
b.logger.Println("f n", n, "err", err)
}
if err != nil {
hasended = true
}
buf = buf[:n]
b.start -= int64(n)
if b.logger != nil {
b.logger.Println("g start", b.start)
}
if b.start > 0 {
i := bytes.Index(buf, b.Sep)
if b.logger != nil {
b.logger.Println("h sep", i)
}
if i > -1 {
b.start += int64(i)
buf = buf[i:]
if b.logger != nil {
b.logger.Println("i start", b.start)
}
}
}
if e := bytes.LastIndex(buf, search); e > -1 {
return b.start + int64(e), nil
}
}

return -1, nil
}

func (b tailSearch) Index2(search []byte) (int64, error) {

if b.Buffer == nil {
b.Buffer = make([]byte, 1024, 1024)
}
buf := b.Buffer
blen := len(b.Buffer)

hasended := false
for !hasended {
if b.logger != nil {
b.logger.Println("a start", b.start)
}
offset := b.start - int64(blen)
if offset < 0 {
offset = 0
hasended = true
}
_, err := b.F.Seek(offset, 0)
if err != nil {
hasended = true
}

n, err := b.F.Read(buf)
if b.logger != nil {
b.logger.Println("f n", n, "err", err)
}
if err != nil {
hasended = true
}
buf = buf[:n]
b.start -= int64(n)

if b.logger != nil {
b.logger.Println("g start", b.start)
}

for i := 1; i < len(search); i++ {
if bytes.HasPrefix(buf, search[i:]) {
e := i - len(search)
b.start += int64(e)
buf = buf[e:]
}
}
if b.logger != nil {
b.logger.Println("g start", b.start)
}

if e := bytes.LastIndex(buf, search); e > -1 {
return b.start + int64(e), nil
}
}

return -1, nil
}

main_test.go
package main

import (
"bytes"
"fmt"
"strings"
"testing"
)

func TestOne(t *testing.T) {

type test struct {
search []byte
readLen int
input string
sep []byte
want int64
}

search := []byte("find me")
blockLen := 1024
tests := []test{
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%stail content", search),
want: 0,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf(""),
want: -1,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: strings.Repeat("nop\n", 10000),
want: -1,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen-5), search),
want: 1019,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen), search),
want: 1024,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen+10), search),
want: 1034,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), (blockLen*2)+10), search),
want: 2058,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%s%stail content", bytes.Repeat([]byte(" "), (blockLen*2)+10), search, search),
want: 2065,
},
}

for i, test := range tests {
s, err := bytesSearch([]byte(test.input))
if err != nil {
t.Fatal(err)
}
s.Buffer = make([]byte, test.readLen)
s.Sep = test.sep
got, err := s.Index(test.search)
if err != nil {
t.Fatal(err)
}
if got != test.want {
t.Fatalf("invalid index at %v got %v wanted %v", i, got, test.want)
}
got, err = s.Index2(test.search)
if err != nil {
t.Fatal(err)
}
if got != test.want {
t.Fatalf("invalid index at %v got %v wanted %v", i, got, test.want)
}
}

}

bench_test.go
package main

import (
"bytes"
"fmt"
"testing"

"github.com/mattetti/filebuffer"
)

func BenchmarkIndex(b *testing.B) {
search := []byte("find me")
blockLen := 1024
input := fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen-5), search)
input += input
s := tailSearch{}
s.F = filebuffer.New([]byte(input))
s.Buffer = make([]byte, blockLen)
for i := 0; i < b.N; i++ {
s.start = int64(len(input))
_, err := s.Index(search)
if err != nil {
b.Fatal(err)
}
}
}

func BenchmarkIndex2(b *testing.B) {
search := []byte("find me")
blockLen := 1024
input := fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen-5), search)
input += input
s := tailSearch{}
s.F = filebuffer.New([]byte(input))
s.Buffer = make([]byte, blockLen)
for i := 0; i < b.N; i++ {
s.start = int64(len(input))
_, err := s.Index2(search)
if err != nil {
b.Fatal(err)
}
}
}

测试
$ go test -v
=== RUN TestOne
--- PASS: TestOne (0.00s)
PASS
ok test/backwardsearch 0.002s
$ go test -bench=. -benchmem -v
=== RUN TestOne
--- PASS: TestOne (0.00s)
goos: linux
goarch: amd64
pkg: test/backwardsearch
BenchmarkIndex-4 20000000 108 ns/op 0 B/op 0 allocs/op
BenchmarkIndex2-4 10000000 167 ns/op 0 B/op 0 allocs/op
PASS
ok test/backwardsearch 4.129s
$ echo "rrrrfindme" | go run main.go -v
2019/10/17 12:17:04 a start 11
2019/10/17 12:17:04 f n 11 err <nil>
2019/10/17 12:17:04 g start 0
Index 4
2019/10/17 12:17:04 a start 11
2019/10/17 12:17:04 f n 11 err <nil>
2019/10/17 12:17:04 g start 0
2019/10/17 12:17:04 g start 0
Index 4
$ cat bench_test.go | go run main.go -search main
Index 8
Index 8
$ go run main.go
Index 2056
Index 2056

关于go - 使用Golang从文本文件末尾读取特定内容的有效方法,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58407633/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com