add custom reader + mmap
This commit is contained in:
parent
87c6dc267d
commit
1b7bb98b5e
72
asyncreader.go
Normal file
72
asyncreader.go
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"golang.org/x/exp/mmap"
|
||||||
|
)
|
||||||
|
|
||||||
|
const lf = byte('\n')
|
||||||
|
const null = byte(0)
|
||||||
|
|
||||||
|
type AsyncLineReader struct {
|
||||||
|
offset int64
|
||||||
|
eof bool
|
||||||
|
file *mmap.ReaderAt
|
||||||
|
mu *sync.Mutex
|
||||||
|
remainder []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewAsyncLineReader(name string) (*AsyncLineReader, error) {
|
||||||
|
var a = &AsyncLineReader{
|
||||||
|
offset: 0,
|
||||||
|
eof: false,
|
||||||
|
mu: &sync.Mutex{},
|
||||||
|
}
|
||||||
|
|
||||||
|
var err error
|
||||||
|
a.file, err = mmap.Open(name)
|
||||||
|
|
||||||
|
return a, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *AsyncLineReader) Read(b []byte) (int, error) {
|
||||||
|
if a.eof {
|
||||||
|
return 0, io.EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
a.mu.Lock()
|
||||||
|
defer a.mu.Unlock()
|
||||||
|
|
||||||
|
var n, err = a.file.ReadAt(b, a.offset)
|
||||||
|
if err == io.EOF {
|
||||||
|
a.eof = true
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
if n == 0 {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// find the last line feed in the batch
|
||||||
|
for {
|
||||||
|
n--
|
||||||
|
|
||||||
|
if b[n] == lf {
|
||||||
|
// selected character is a line feed, we're done
|
||||||
|
break
|
||||||
|
} else if n == 0 {
|
||||||
|
break // result is at start of file
|
||||||
|
}
|
||||||
|
|
||||||
|
b[n] = null // Discard this byte so it doesn't get read multiple times
|
||||||
|
}
|
||||||
|
|
||||||
|
a.offset += int64(n)
|
||||||
|
|
||||||
|
return n, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *AsyncLineReader) Close() error {
|
||||||
|
return a.file.Close()
|
||||||
|
}
|
2
build.sh
2
build.sh
@ -1,3 +1,3 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
go build -o rapidgrep -ldflags="-s -w" main.go
|
go build -o rapidgrep -ldflags="-s -w" asyncreader.go main.go
|
||||||
|
71
main.go
71
main.go
@ -1,28 +1,24 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
|
||||||
"bytes"
|
"bytes"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"runtime"
|
"runtime"
|
||||||
"runtime/pprof"
|
|
||||||
"sync"
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
const lf = byte('\n')
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
f, _ := os.Create("cprof")
|
// prof, _ := os.Create("cprof")
|
||||||
pprof.StartCPUProfile(f)
|
// pprof.StartCPUProfile(prof)
|
||||||
defer pprof.StopCPUProfile()
|
// defer pprof.StopCPUProfile()
|
||||||
|
|
||||||
if len(os.Args) != 2 {
|
if len(os.Args) != 2 {
|
||||||
print("One argument required\n")
|
print("One argument required\n")
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
f, err := os.Open("locate.txt")
|
f, err := NewAsyncLineReader("locate.txt")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
@ -34,68 +30,42 @@ func main() {
|
|||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
// Make a channel and a thread for each CPU core
|
// Make a channel and a thread for each CPU core
|
||||||
var cores = runtime.NumCPU()
|
var threads = runtime.NumCPU()
|
||||||
var workchannel = make(chan []byte)
|
for i := 0; i < threads; i++ {
|
||||||
for i := 0; i < cores; i++ {
|
go scannerThread(f, []byte(needle), &wg)
|
||||||
go scannerThread(workchannel, []byte(needle), &wg)
|
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
var br = bufio.NewReaderSize(f, 1<<25) // reader with 32 MiB buffer
|
|
||||||
var buf = make([]byte, 1<<20) // 1 MiB buffer for searching
|
|
||||||
var remainder []byte
|
|
||||||
var nread int
|
|
||||||
|
|
||||||
for {
|
|
||||||
nread, err = br.Read(buf)
|
|
||||||
if err != nil && err != io.EOF {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
if err != io.EOF {
|
|
||||||
// Get the remainder of the last line
|
|
||||||
remainder, err = br.ReadBytes(byte('\n'))
|
|
||||||
if err != nil && err != io.EOF {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
workchannel <- append(buf[:nread], remainder...)
|
|
||||||
|
|
||||||
if err == io.EOF {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
close(workchannel)
|
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
func scannerThread(workchannel chan []byte, needle []byte, wg *sync.WaitGroup) {
|
func scannerThread(r *AsyncLineReader, needle []byte, wg *sync.WaitGroup) {
|
||||||
var batch []byte
|
|
||||||
var ok bool
|
|
||||||
var i, linelen, start, end int
|
var i, linelen, start, end int
|
||||||
|
var buf = make([]byte, 1<<24) // 1 MiB buffer for searching
|
||||||
|
var err error
|
||||||
|
|
||||||
defer wg.Done() // Sync up the goroutines when done
|
defer wg.Done() // Sync up the goroutines when done
|
||||||
|
|
||||||
for {
|
for {
|
||||||
batch, ok = <-workchannel
|
linelen, err = r.Read(buf)
|
||||||
if !ok {
|
if err != nil {
|
||||||
return // channel closed. we're done
|
if err == io.EOF {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
panic(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
linelen = len(batch)
|
|
||||||
|
|
||||||
// This loop is for every result found, when there are no more results
|
// This loop is for every result found, when there are no more results
|
||||||
// it stops. When it runs for the first time the first index function is
|
// it stops. When it runs for the first time the first index function is
|
||||||
// executed, which indexes the entire batch. In the following iterations
|
// executed, which indexes the entire batch. In the following iterations
|
||||||
// the second index function is used, which begins searching where the
|
// the second index function is used, which begins searching where the
|
||||||
// last result ended so it doesn't get found twice.
|
// last result ended so it doesn't get found twice.
|
||||||
for i = bytes.Index(batch, needle); i != -1; i = bytes.Index(batch[end:], needle) {
|
for i = bytes.Index(buf, needle); i != -1; i = bytes.Index(buf[end:], needle) {
|
||||||
start, end = i+end, i+end
|
start, end = i+end, i+end
|
||||||
// needle was found, but where?
|
// needle was found, but where?
|
||||||
|
|
||||||
for { // find the start (line feed of the line before it, or index 0)
|
for { // find the start (line feed of the line before it, or index 0)
|
||||||
if batch[start] == lf {
|
if buf[start] == lf {
|
||||||
start++ // the line feed is from the previous line, so skip it
|
start++ // the line feed is from the previous line, so skip it
|
||||||
break
|
break
|
||||||
} else if start == 0 {
|
} else if start == 0 {
|
||||||
@ -105,7 +75,7 @@ func scannerThread(workchannel chan []byte, needle []byte, wg *sync.WaitGroup) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for { // find the end (line feed at the end of the line)
|
for { // find the end (line feed at the end of the line)
|
||||||
if batch[end] == lf {
|
if buf[end] == lf {
|
||||||
end++ // include the line feed in the line
|
end++ // include the line feed in the line
|
||||||
// https://stackoverflow.com/questions/26857582/in-a-go-slice-why-does-slohi-end-at-element-hi-1
|
// https://stackoverflow.com/questions/26857582/in-a-go-slice-why-does-slohi-end-at-element-hi-1
|
||||||
break
|
break
|
||||||
@ -118,11 +88,12 @@ func scannerThread(workchannel chan []byte, needle []byte, wg *sync.WaitGroup) {
|
|||||||
// Print the result. Note that stdout is not necessarily
|
// Print the result. Note that stdout is not necessarily
|
||||||
// concurrency-safe, so in a real application this would have to be
|
// concurrency-safe, so in a real application this would have to be
|
||||||
// passed through a channel.
|
// passed through a channel.
|
||||||
os.Stdout.Write(batch[start:end])
|
os.Stdout.Write(buf[start:end])
|
||||||
|
|
||||||
// This is to keep track of where the end of the byte array is so we
|
// This is to keep track of where the end of the byte array is so we
|
||||||
// can avoid index out of bounds panics
|
// can avoid index out of bounds panics
|
||||||
linelen = linelen - end
|
linelen = linelen - end
|
||||||
}
|
}
|
||||||
|
end = 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user