Fast multithreaded string searching in large text files.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

111 lines
2.1 KiB

6 years ago
  1. package main
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "os"
  7. "runtime"
  8. "sync"
  9. )
  10. const lf = byte('\n') // line-feed byte; scannerThread compares against it to find the bounds of a matched line
  11. func main() {
  12. if len(os.Args) != 2 {
  13. print("One argument required\n")
  14. os.Exit(1)
  15. }
  16. f, err := os.Open("locate.txt")
  17. if err != nil {
  18. panic(err)
  19. }
  20. defer f.Close()
  21. var needle = os.Args[1]
  22. // To make sure all threads have ended when the program finishes
  23. var wg sync.WaitGroup
  24. // Make a channel and a thread for each CPU core
  25. var cores = runtime.NumCPU()
  26. var linechan = make(chan []byte)
  27. for i := 0; i < cores; i++ {
  28. go scannerThread(linechan, []byte(needle), &wg)
  29. wg.Add(1)
  30. }
  31. var br = bufio.NewReaderSize(f, 1<<25) // reader with 32 MiB buffer
  32. var buf = make([]byte, 1<<20) // 1 MiB buffer for searching
  33. var remainder []byte
  34. var nread int
  35. for {
  36. nread, err = br.Read(buf)
  37. if err != nil && err != io.EOF {
  38. panic(err)
  39. }
  40. if err != io.EOF {
  41. // Get the remainder of the last line
  42. remainder, err = br.ReadBytes(byte('\n'))
  43. if err != nil && err != io.EOF {
  44. panic(err)
  45. }
  46. }
  47. linechan <- append(buf[:nread], remainder...)
  48. if err == io.EOF {
  49. break
  50. }
  51. }
  52. close(linechan)
  53. wg.Wait()
  54. }
  55. func scannerThread(linechannel chan []byte, needle []byte, wg *sync.WaitGroup) {
  56. var line []byte
  57. var ok bool
  58. var start, end int
  59. var linelen int
  60. var i int
  61. defer wg.Done()
  62. for {
  63. line, ok = <-linechannel
  64. if !ok {
  65. return // channel closed. we're done
  66. }
  67. // This loop is for every result found, when there are no more results it stops
  68. for i = bytes.Index(line, needle); i != -1; i = bytes.Index(line, needle) {
  69. start, end = i, i
  70. // needle was found, but where?
  71. for { // find the start
  72. if line[start] == lf {
  73. start++ // the line feed is from the previous line, so skip it
  74. break
  75. } else if start == 0 {
  76. break // result is at start of file
  77. }
  78. start--
  79. }
  80. for { // find the end
  81. if line[end] == lf || end == linelen-1 {
  82. break
  83. }
  84. end++
  85. }
  86. print(string(line[start:end]) + "\n")
  87. // Chop all of the bytes before the result off so it doesn't get
  88. // searched again
  89. line = line[end:]
  90. linelen = len(line)
  91. }
  92. }
  93. }