i have a big file (too big for memory), and i need to parse each record with the delimiter "|". The problem is, each record has a different size, thats why i always get the error "bufio.Scanner: token too long".
Is there any way, to retry the scan with a bigger buffersize, if i hit the bufio.Scanner: token too long error? Or is the bufio.Scanner not the right choice in my case, because i dont know the exact size of each record?
Thank you
https://go.dev/play/p/Erx15nXXCGk
package main
import (
"bufio"
"bytes"
"fmt"
"strings"
)
func main() {
// Example byte array
byteArray := []byte("data1|data2|data3|data4|data5|data6|data7|data8|data9|data10")
// Buffer size for reading chunks
bufferSize := 6
buf := make([]byte, bufferSize)
// Create a bufio.Scanner with a custom split function
scanner := bufio.NewScanner(bytes.NewReader(byteArray))
scanner.Buffer(buf, bufferSize)
scanner.Split(splitFunc)
// Read the byte array in chunks
for scanner.Scan() {
// Process each token (chunk)
chunk := scanner.Text()
fmt.Println("Chunk:", chunk, "Chunk Length:", len(chunk))
}
if err := scanner.Err(); err != nil {
if err == bufio.ErrTooLong {
fmt.Println("Error:", err)
fmt.Printf("Buffer size %d is too small...\n", bufferSize)
} else {
fmt.Println("Error:", err)
}
}
}
func splitFunc(data []byte, atEOF bool) (advance int, token []byte, err error) {
// Return nothing if at end of file and no data passed
if atEOF && len(data) == 0 {
return 0, nil, nil
}
if i := strings.Index(string(data), "|"); i >= 0 {
return i + 1, data[0:i], nil
}
// If at end of file with data return the data
if atEOF {
return len(data), data, nil
}
return
}
You don't need to provide any buffer.
If you simply remove it, the program works fine. You'll be subject to MaxScanTokenSize which appears to be 64k.