Fast multithreaded string searching in large text files.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

99 lines
2.4 KiB

package main
import (
"bytes"
"io"
"os"
"runtime"
"sync"
)
// main searches locate.txt for the string given as the single command-line
// argument. It starts one scannerThread goroutine per CPU core, all sharing
// the same AsyncLineReader, and waits for every goroutine to finish before
// returning.
//
// The program exits with status 1 when the argument count is wrong and panics
// when the reader cannot be created.
func main() {
	// prof, _ := os.Create("cprof")
	// pprof.StartCPUProfile(prof)
	// defer pprof.StopCPUProfile()
	if len(os.Args) != 2 {
		print("One argument required\n")
		os.Exit(1)
	}

	f, err := NewAsyncLineReader("locate.txt")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Convert the needle once; the scanner goroutines only read it, so they
	// can share the same slice.
	needle := []byte(os.Args[1])

	// The WaitGroup makes sure all goroutines have ended before the program
	// finishes.
	var wg sync.WaitGroup

	// Start one scanner goroutine for each CPU core. The counter is raised
	// before any goroutine is launched: calling Add after `go` would let a
	// fast goroutine reach Done first, which panics on a negative counter or
	// lets Wait return too early.
	threads := runtime.NumCPU()
	wg.Add(threads)
	for i := 0; i < threads; i++ {
		go scannerThread(f, needle, &wg)
	}
	wg.Wait()
}
// scannerThread repeatedly reads a batch of text from r into a private buffer
// and writes every line that contains needle to standard output. It returns
// when r reports io.EOF and calls wg.Done on the way out.
//
// It panics on any read error other than io.EOF and on any error writing to
// standard output.
//
// NOTE(review): this assumes r.Read behaves like io.Reader.Read (fills buf
// from index 0 and returns the number of valid bytes) and that every batch
// ends on a line boundary; a line split across two batches would be printed
// only in part. Confirm against AsyncLineReader.
func scannerThread(r *AsyncLineReader, needle []byte, wg *sync.WaitGroup) {
	defer wg.Done() // Sync up the goroutines when done

	buf := make([]byte, 1<<24) // 16 MiB buffer for searching
	for {
		n, err := r.Read(buf)

		// Handle the bytes that were returned before looking at the error,
		// as the io.Reader contract recommends, so a final batch delivered
		// together with io.EOF is not lost. Only buf[:n] is searched; the
		// rest of the buffer may still hold data from an earlier read.
		if n > 0 {
			// Several goroutines write to stdout at once, so the order of
			// results across goroutines is not deterministic.
			if werr := writeMatchingLines(os.Stdout, buf[:n], needle); werr != nil {
				panic(werr)
			}
		}

		if err != nil {
			if err == io.EOF {
				return
			}
			panic(err)
		}
	}
}

// writeMatchingLines writes to w every line of data that contains needle,
// each line at most once and including its trailing line feed when present.
// A match that spans a line feed is reported on the line where it begins.
// An empty needle matches every line. The first write error is returned.
func writeMatchingLines(w io.Writer, data, needle []byte) error {
	for pos := 0; pos < len(data); {
		i := bytes.Index(data[pos:], needle)
		if i == -1 {
			return nil
		}
		hit := pos + i

		// The line starts just after the previous line feed, or at index 0
		// when there is none (LastIndexByte returns -1 in that case).
		start := bytes.LastIndexByte(data[:hit], lf) + 1

		// The line ends just after the next line feed, or at the end of the
		// data when the last line is not terminated.
		end := len(data)
		if j := bytes.IndexByte(data[hit:], lf); j != -1 {
			end = hit + j + 1
		}

		if _, err := w.Write(data[start:end]); err != nil {
			return err
		}

		// Resume after this line so it is not reported twice. end is always
		// greater than hit, so the loop makes progress on every iteration.
		pos = end
	}
	return nil
}