Fast multithreaded string searching in large text files.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

99 lines
2.4 KiB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
  1. package main
  2. import (
  3. "bytes"
  4. "io"
  5. "os"
  6. "runtime"
  7. "sync"
  8. )
  9. func main() {
  10. // prof, _ := os.Create("cprof")
  11. // pprof.StartCPUProfile(prof)
  12. // defer pprof.StopCPUProfile()
  13. if len(os.Args) != 2 {
  14. print("One argument required\n")
  15. os.Exit(1)
  16. }
  17. f, err := NewAsyncLineReader("locate.txt")
  18. if err != nil {
  19. panic(err)
  20. }
  21. defer f.Close()
  22. var needle = os.Args[1]
  23. // To make sure all threads have ended when the program finishes
  24. var wg sync.WaitGroup
  25. // Make a channel and a thread for each CPU core
  26. var threads = runtime.NumCPU()
  27. for i := 0; i < threads; i++ {
  28. go scannerThread(f, []byte(needle), &wg)
  29. wg.Add(1)
  30. }
  31. wg.Wait()
  32. }
  33. func scannerThread(r *AsyncLineReader, needle []byte, wg *sync.WaitGroup) {
  34. var i, linelen, start, end int
  35. var buf = make([]byte, 1<<24) // 1 MiB buffer for searching
  36. var err error
  37. defer wg.Done() // Sync up the goroutines when done
  38. for {
  39. linelen, err = r.Read(buf)
  40. if err != nil {
  41. if err == io.EOF {
  42. return
  43. }
  44. panic(err)
  45. }
  46. // This loop is for every result found, when there are no more results
  47. // it stops. When it runs for the first time the first index function is
  48. // executed, which indexes the entire batch. In the following iterations
  49. // the second index function is used, which begins searching where the
  50. // last result ended so it doesn't get found twice.
  51. for i = bytes.Index(buf, needle); i != -1; i = bytes.Index(buf[end:], needle) {
  52. start, end = i+end, i+end
  53. // needle was found, but where?
  54. for { // find the start (line feed of the line before it, or index 0)
  55. if buf[start] == lf {
  56. start++ // the line feed is from the previous line, so skip it
  57. break
  58. } else if start == 0 {
  59. break // result is at start of file
  60. }
  61. start--
  62. }
  63. for { // find the end (line feed at the end of the line)
  64. if buf[end] == lf {
  65. end++ // include the line feed in the line
  66. // https://stackoverflow.com/questions/26857582/in-a-go-slice-why-does-slohi-end-at-element-hi-1
  67. break
  68. } else if end == linelen {
  69. break
  70. }
  71. end++
  72. }
  73. // Print the result. Note that stdout is not necessarily
  74. // concurrency-safe, so in a real application this would have to be
  75. // passed through a channel.
  76. os.Stdout.Write(buf[start:end])
  77. // This is to keep track of where the end of the byte array is so we
  78. // can avoid index out of bounds panics
  79. linelen = linelen - end
  80. }
  81. end = 0
  82. }
  83. }