Fast multithreaded string searching in large text files.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

111 lines
2.1 KiB

package main
import (
"bufio"
"bytes"
"io"
"os"
"runtime"
"sync"
)
// lf is the line-feed byte ('\n') that marks the end of a line in the searched text.
const lf = byte('\n')
// main searches the file "locate.txt" for the string given as the single
// command-line argument and prints every line that contains it.
//
// The file is read in chunks of roughly 1 MiB. Each chunk is extended to the
// next line feed so that no line is ever split across two chunks, and is then
// handed to one of runtime.NumCPU() scannerThread workers over an unbuffered
// channel. main exits with status 1 when the argument count is wrong and
// panics on any I/O error other than io.EOF.
func main() {
	if len(os.Args) != 2 {
		print("One argument required\n")
		os.Exit(1)
	}
	f, err := os.Open("locate.txt")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// The workers only read the needle, so one shared copy is enough.
	needle := []byte(os.Args[1])

	// wg lets main wait until every worker has drained the channel.
	var wg sync.WaitGroup
	linechan := make(chan []byte)
	for i := 0; i < runtime.NumCPU(); i++ {
		// Register the worker before starting it so the counter can never be
		// observed lower than the number of running workers.
		wg.Add(1)
		go scannerThread(linechan, needle, &wg)
	}

	br := bufio.NewReaderSize(f, 1<<25) // reader with 32 MiB buffer
	buf := make([]byte, 1<<20)          // 1 MiB scratch buffer, reused for every read
	var nread int
	for {
		nread, err = br.Read(buf)
		if err != nil && err != io.EOF {
			panic(err)
		}

		// remainder is scoped to one iteration: a tail left over from the
		// previous chunk must never be sent to the workers a second time.
		var remainder []byte
		if err != io.EOF {
			// Extend the chunk to the end of the line that Read cut in half.
			remainder, err = br.ReadBytes('\n')
			if err != nil && err != io.EOF {
				panic(err)
			}
		}

		if nread+len(remainder) > 0 {
			// Hand the worker its own copy. buf is overwritten by the next
			// Read while the worker may still be scanning this chunk.
			chunk := make([]byte, 0, nread+len(remainder))
			chunk = append(chunk, buf[:nread]...)
			chunk = append(chunk, remainder...)
			linechan <- chunk
		}

		if err == io.EOF {
			break
		}
	}

	// Closing the channel tells the workers that no more chunks will arrive.
	close(linechan)
	wg.Wait()
}
// scannerThread is a search worker. It receives chunks of line-feed separated
// text from linechannel until the channel is closed, and prints every line of
// each chunk that contains needle, followed by a line feed. A line is printed
// once even if needle occurs in it several times.
//
// Parameters:
//   - linechannel: source of text chunks; the worker returns once it is closed
//     and drained.
//   - needle: the byte sequence to search for; it is only read, never modified.
//   - wg: wait group on which Done is called when the worker returns.
//
// A matching line that is not terminated by a line feed (for example the last
// line of a file) is printed in full. An empty needle matches every line.
//
// NOTE(review): results are written with the builtin print, which the Go
// documentation describes as writing to standard error; confirm that this is
// the intended output stream.
func scannerThread(linechannel chan []byte, needle []byte, wg *sync.WaitGroup) {
	defer wg.Done()

	for chunk := range linechannel {
		// Each pass reports one matching line and then continues the search
		// behind it, until the chunk is exhausted or holds no further match.
		for len(chunk) > 0 {
			i := bytes.Index(chunk, needle)
			if i < 0 {
				break
			}

			// The line starts right after the previous line feed, or at the
			// beginning of the chunk (LastIndexByte returns -1, giving 0).
			start := bytes.LastIndexByte(chunk[:i], '\n') + 1

			// The line ends at the next line feed, or at the end of the chunk
			// when the final line is unterminated.
			end := len(chunk)
			if off := bytes.IndexByte(chunk[i:], '\n'); off >= 0 {
				end = i + off
			}

			print(string(chunk[start:end]) + "\n")

			if end == len(chunk) {
				break // the match was in the last line of the chunk
			}
			// Skip the reported line and its line feed so it is not searched
			// again; this always advances by at least one byte.
			chunk = chunk[end+1:]
		}
	}
}