-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlexer.cpp
More file actions
110 lines (90 loc) · 3.26 KB
/
lexer.cpp
File metadata and controls
110 lines (90 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#include "file_io.hpp"
#include "token.hpp"
#include "fsm.hpp"
#include "winnow.hpp"
#include "similarity.hpp"
#include <vector>
#include <iostream>
#include <string>
#include <stdbool.h>
#include <thread>
#include <chrono>
#include <filesystem>
namespace fs = filesystem;
void process_file(const char *filename, bool originalFile) {
FILE *file = open_file(filename);
if (file != NULL) {
string fname_str(filename);
FILE *logTokens = initializeOutputFile((fname_str + "_output.txt").c_str(), "tokens");
FILE *logFingerprints = initializeOutputFile((fname_str + "_fingerprints.txt").c_str(), "fingerprints");
FSM mach;
vector<Token> tokens = mach.fsm(file);
for (const auto& token : tokens) {
token.describe(logTokens, mach.iToken);
}
fseek(logTokens, 0, SEEK_SET);
if (originalFile) {
auto result = winnow(5, 3, logFingerprints, mach.iToken, mach.iToken.size(), true);
if (result)
orig_fingerprints = *result;
}
else
winnow(5, 3, logFingerprints, mach.iToken, mach.iToken.size(), false);
mach.iToken.clear();
close_file(filename, file);
fclose(logTokens);
fclose(logFingerprints);
}
else {
cerr << "Failed to open file: " << filename << endl;
}
}
// Entry point.
//
// Usage: <prog> <original_file> <input_file>... [--m | --M]
//
// argv[1] is processed as the original file and its fingerprints are counted
// into the global `orig_fingerprintMap`. Every following argument is processed
// as an input file (more than one is accepted).
//
//   * Without a trailing flag, inputs are processed one after another and
//     jaccard_similarity() is called for each against the original.
//   * With a trailing "--m" / "--M", inputs are processed on worker threads in
//     batches of at most hardware_concurrency() threads.
//
// Prints the elapsed wall-clock time in seconds on success.
// Returns 0 on success, 1 on a usage error.
int main(int argc, char **argv) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <original_file> <input_file>" << std::endl;
        return 1;
    }

    // Parse the trailing argument once instead of rebuilding the string for
    // every comparison.
    const std::string lastArg(argv[argc - 1]);
    const bool multithreaded = (lastArg == "--m" || lastArg == "--M");

    // A trailing "--something" that is not the multithread flag is an unknown
    // option. (Previously this diagnostic sat in an unreachable else branch.)
    if (!multithreaded && lastArg.rfind("--", 0) == 0) {
        std::cerr << "Invalid option: " << argv[argc - 1] << std::endl;
        std::cerr << "Usage: " << argv[0] << " <original_file> <input_file> --m" << std::endl;
        return 1;
    }

    // One past the index of the last input file; the flag is not a file.
    const int inputEnd = multithreaded ? argc - 1 : argc;
    if (inputEnd <= 2) {
        // e.g. "<prog> original --m": the flag was given but no input files.
        std::cerr << "Usage: " << argv[0] << " <original_file> <input_file> --m" << std::endl;
        return 1;
    }

    if (!fs::exists("analysis")) {
        fs::create_directories("analysis");
    }

    auto start = std::chrono::high_resolution_clock::now();

    process_file(argv[1], true); // Process the original file separately
    for (const unsigned int &fingerprint : orig_fingerprints) {
        ++orig_fingerprintMap[fingerprint];
    }

    if (multithreaded) {
        // hardware_concurrency() may return 0 when the value is not computable.
        unsigned int lprocs = std::thread::hardware_concurrency();
        if (lprocs == 0) {
            lprocs = 1;
        }

        std::vector<std::thread> threads;
        threads.reserve(lprocs);
        for (int i = 2; i < inputEnd; ++i) {
            threads.emplace_back([filename = argv[i]]() {
                process_file(filename, false);
            });
            // Batch is full: wait for all of it before launching more.
            if (threads.size() >= lprocs) {
                for (auto &t : threads) {
                    t.join();
                }
                threads.clear();
            }
        }
        for (auto &t : threads) {
            t.join();
        }
        // NOTE(review): unlike the sequential path, this path never calls
        // jaccard_similarity(); confirm whether that is intentional.
    }
    else {
        for (int i = 2; i < inputEnd; ++i) {
            process_file(argv[i], false);
            const std::string fpath = "analysis/fingerprints/" + std::string(argv[i]) + "_fingerprints.txt";
            jaccard_similarity(fpath, argv[i], argv[1]);
        }
    }

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> duration = end - start;
    std::cout << duration.count() << " seconds" << std::endl;
    return 0;
}