Skip to content

Commit c4fd202

Browse files
authored
Fix dedup performance for big files (#27)
1 parent fa70f16 commit c4fd202

File tree

2 files changed

+70
-18
lines changed

2 files changed

+70
-18
lines changed

cmd/ipdex/file/file.go

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import (
1515
"os"
1616
"path/filepath"
1717
"regexp"
18-
"slices"
1918

2019
"github.com/crowdsecurity/crowdsec/pkg/cticlient"
2120
"github.com/pterm/pterm"
@@ -26,6 +25,40 @@ var (
2625
ipRegex = regexp.MustCompile(`(?:[0-9]{1,3}\.){3}[0-9]{1,3}|[a-fA-F0-9:]+`)
2726
)
2827

28+
func collectIPsFromFile(filePath string) ([]string, error) {
29+
readFile, err := os.Open(filePath)
30+
if err != nil {
31+
return nil, err
32+
}
33+
defer readFile.Close()
34+
35+
ipsToProcess := make([]string, 0)
36+
seenIPs := make(map[string]struct{})
37+
fileScanner := bufio.NewScanner(readFile)
38+
fileScanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
39+
fileScanner.Split(bufio.ScanLines)
40+
41+
for fileScanner.Scan() {
42+
line := fileScanner.Text()
43+
ipsMatch := ipRegex.FindAllString(line, -1)
44+
for _, ipAddr := range ipsMatch {
45+
if !config.IsValidIP(ipAddr) {
46+
continue
47+
}
48+
if _, exists := seenIPs[ipAddr]; exists {
49+
continue
50+
}
51+
seenIPs[ipAddr] = struct{}{}
52+
ipsToProcess = append(ipsToProcess, ipAddr)
53+
}
54+
}
55+
if err := fileScanner.Err(); err != nil {
56+
return nil, err
57+
}
58+
59+
return ipsToProcess, nil
60+
}
61+
2962
func FileCommand(file string, forceRefresh bool, yes bool) {
3063
outputFormat := viper.GetString(config.OutputFormatOption)
3164
filepath, err := filepath.Abs(file)
@@ -52,33 +85,23 @@ func FileCommand(file string, forceRefresh bool, yes bool) {
5285
reportExist = false
5386
}
5487
if !reportExist {
55-
readFile, err := os.Open(filepath)
88+
if outputFormat == display.HumanFormat {
89+
style.Infof("Scanning file '%s' for IPs...", filepath)
90+
}
91+
ipsToProcess, err = collectIPsFromFile(filepath)
5692
if err != nil {
5793
style.Fatal(err.Error())
5894
}
59-
60-
fileScanner := bufio.NewScanner(readFile)
61-
fileScanner.Split(bufio.ScanLines)
62-
for fileScanner.Scan() {
63-
line := fileScanner.Text()
64-
ipsMatch := ipRegex.FindAllString(line, -1)
65-
for _, ipAddr := range ipsMatch {
66-
if slices.Contains(ipsToProcess, ipAddr) {
67-
continue
68-
}
69-
if !config.IsValidIP(ipAddr) {
70-
continue
71-
}
72-
ipsToProcess = append(ipsToProcess, ipAddr)
73-
}
74-
}
7595
nbIPToProcess = len(ipsToProcess)
7696
if nbIPToProcess == 0 {
7797
if outputFormat == display.HumanFormat {
7898
style.Info("No valid IP addresses found in the file.")
7999
}
80100
return
81101
}
102+
if outputFormat == display.HumanFormat {
103+
style.Infof("Found %d unique IPs.", nbIPToProcess)
104+
}
82105
} else {
83106

84107
for _, ip := range report.IPs {

cmd/ipdex/file/file_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package file
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"reflect"
7+
"testing"
8+
)
9+
10+
func TestCollectIPsFromFileDeduplicatesAndKeepsValidIPs(t *testing.T) {
11+
t.Parallel()
12+
13+
tmpDir := t.TempDir()
14+
filePath := filepath.Join(tmpDir, "ips.txt")
15+
content := "1.1.1.1\ninvalid\n2.2.2.2 extra 1.1.1.1\n2001:4860:4860::8888\n999.1.1.1\n"
16+
if err := os.WriteFile(filePath, []byte(content), 0o600); err != nil {
17+
t.Fatalf("write temp file: %v", err)
18+
}
19+
20+
got, err := collectIPsFromFile(filePath)
21+
if err != nil {
22+
t.Fatalf("collectIPsFromFile returned error: %v", err)
23+
}
24+
25+
want := []string{"1.1.1.1", "2.2.2.2", "2001:4860:4860::8888"}
26+
if !reflect.DeepEqual(got, want) {
27+
t.Fatalf("unexpected IP list\nwant: %#v\ngot: %#v", want, got)
28+
}
29+
}

0 commit comments

Comments
 (0)