diff --git a/.gitignore b/.gitignore index cd3cbc0..8eb07c9 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,6 @@ junk/ # linux binaries bin/freq* bin/hack* + +# osx trash +*.dSYM diff --git a/build/hopscotch/.gitignore b/build/hopscotch/.gitignore new file mode 100644 index 0000000..28bcef5 --- /dev/null +++ b/build/hopscotch/.gitignore @@ -0,0 +1,12 @@ +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps +freq \ No newline at end of file diff --git a/build/hopscotch/CMakeLists.txt b/build/hopscotch/CMakeLists.txt new file mode 100644 index 0000000..8be7b30 --- /dev/null +++ b/build/hopscotch/CMakeLists.txt @@ -0,0 +1,13 @@ +cmake_minimum_required(VERSION 3.2) + +project(freq) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O3") + +set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) + +add_subdirectory(hopscotch-map) + +add_executable(freq src/main.cpp) +target_link_libraries(freq PRIVATE tsl::hopscotch_map) diff --git a/build/hopscotch/hopscotch-map b/build/hopscotch/hopscotch-map new file mode 160000 index 0000000..29030f5 --- /dev/null +++ b/build/hopscotch/hopscotch-map @@ -0,0 +1 @@ +Subproject commit 29030f55ca518bb1be5113ab0a8e134772024a9d diff --git a/build/hopscotch/src/main.cpp b/build/hopscotch/src/main.cpp new file mode 100644 index 0000000..d64ea52 --- /dev/null +++ b/build/hopscotch/src/main.cpp @@ -0,0 +1,165 @@ +/* + + clang++ -std=c++ -O3 src/freq04.cpp -o freq + +*/ + +#include +#include +#include +#include +#include +#include + +#include +#include // open + +#if defined(_WIN32) || defined(WIN32) +// windows +#include "ext/windows-mmap.h" +#include +#define lseek64 _lseeki64 +#pragma warning(disable : 4996) +#else +// unix +#include // mmap +#include // lseek +#endif + +#if __APPLE__ +#define lseek64 lseek +#endif + +#if __linux__ +#include +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) +#define _MAP_POPULATE_AVAILABLE +#endif +#endif + +#ifdef _MAP_POPULATE_AVAILABLE +#define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE) +#else +#define MMAP_FLAGS MAP_PRIVATE +#endif + +typedef unsigned char byte; +typedef unsigned long long ullong; + +class Word { + public: + Word(const std::string &s, int i) : text(s), occurances(i) {} + + std::string text; + int occurances; +}; + +int main(int argc, char **argv) { + if (argc != 3) { + exit(1); + } + + int fd = open(argv[1], O_RDONLY); + if (fd == -1) { + std::cerr << "Can't open file" << std::endl; + exit(1); + } + + FILE *out = fopen(argv[2], "wb+"); + if (!out) { + std::cerr << "failed to open output file" << std::endl; + exit(1); + } + + const size_t fsz = lseek(fd, 0, SEEK_END); + if (fsz == -1) { + std::cerr << "Failed to calculate file size" << std::endl; + exit(1); + } + + + const uint8_t *begin = reinterpret_cast( + mmap(NULL, fsz, PROT_READ, MMAP_FLAGS, fd, 0)); + + if (begin == nullptr || begin == MAP_FAILED) { + std::cerr << "Unable to mmap" << std::endl; + exit(1); + } + + std::cout << "file mmap'ed" << std::endl; + + char letters[256]; + memset(letters, 0, sizeof(letters)); + for (size_t c = 0; c < 256; ++c) { + if (c >= 'a' and c <= 'z') { + letters[c] = c; + } + if (c >= 'A' and c <= 'Z') { + letters[c] = c - 'A' + 'a'; + } + } + + std::cout << "letters: " << std::string(letters, 256) << std::endl; + + const char *start = nullptr; + size_t len = 0; + + /* + * Calculating the hash and comparing two std::string may be slow. + * We can store the hash of each std::string in the hash map to make + * the inserts and lookups faster by setting StoreHash to true. + */ + tsl::hopscotch_map, + std::equal_to, + std::allocator>, + 30, true> map; + + auto end = begin + fsz; + auto word = std::string(start, len); + + for (auto s = begin; s != end; ++s) { + const auto ch = letters[*s]; + + if (ch) { + if (not start) { + start = reinterpret_cast(s); + } + ++len; + continue; + } else if (len == 0) { + continue; + } else { + word.assign(start, len); + + for (size_t i = 0; i< len; ++i) { + word[i] = letters[word[i]]; + } + + if (map.find(word) != map.end()) { + map[word]++; + } else { + map[word] = 1; + } + } + + len = 0; + start = nullptr; + } + + std::vector words; + words.reserve(map.size()); + + for(const auto& key_value : map) { + // TODO string copied here?.. + words.emplace_back(Word(key_value.first, key_value.second)); + } + + + std::sort(words.begin(), words.end(), [](const Word& a, const Word& b) -> bool { + return a.occurances > b.occurances; + }); + + for (const auto& word : words) { + fprintf(out, "%d %s\n", word.occurances, word.text.c_str()); + } +} diff --git a/src/freq01.cpp b/src/freq01.cpp index 3b301f0..7ab99f5 100644 --- a/src/freq01.cpp +++ b/src/freq01.cpp @@ -22,6 +22,23 @@ #include #endif +#if __APPLE__ +#define lseek64 lseek +#endif + +#if __linux__ +#include +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) +#define _MAP_POPULATE_AVAILABLE +#endif +#endif + +#ifdef _MAP_POPULATE_AVAILABLE +#define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE) +#else +#define MMAP_FLAGS MAP_PRIVATE +#endif + constexpr uint32_t H = 2166136261; int usage(char *process_name) { @@ -88,8 +105,14 @@ int main(int argc, char **argv) { } const size_t fsz = lseek64(fd, 0, SEEK_END); + + if (fsz == -1) { + std::cerr << "Failed to calculate file size" << std::endl; + exit(1); + } + const uint8_t *begin = reinterpret_cast( - mmap(NULL, fsz, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0)); + mmap(NULL, fsz, PROT_READ, MMAP_FLAGS, fd, 0)); char letters[256]; for (size_t i = 0; i < 256; ++i) { diff --git a/src/freq03.cpp b/src/freq03.cpp index a3e58a3..a4507e8 100644 --- a/src/freq03.cpp +++ b/src/freq03.cpp @@ -16,6 +16,23 @@ #define __forceinline __attribute__((always_inline)) inline #endif +#if __APPLE__ +#define lseek64 lseek +#endif + +#if __linux__ +#include +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) +#define _MAP_POPULATE_AVAILABLE +#endif +#endif + +#ifdef _MAP_POPULATE_AVAILABLE +#define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE) +#else +#define MMAP_FLAGS MAP_PRIVATE +#endif + typedef unsigned char byte; typedef unsigned int uint; typedef unsigned long long ullong; @@ -184,7 +201,7 @@ int main(int argc, char **argv) { const ullong fsz = lseek64(fd, 0, SEEK_END); const byte *fbegin = - (const byte *)mmap(NULL, fsz, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0); + (const byte *)mmap(NULL, fsz, PROT_READ, MMAP_FLAGS, fd, 0); static byte lc[256]; memset(lc, 0, sizeof(lc));