Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@ junk/
# linux binaries
bin/freq*
bin/hack*

# osx trash
*.dSYM
12 changes: 12 additions & 0 deletions build/hopscotch/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps
freq
13 changes: 13 additions & 0 deletions build/hopscotch/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Build script for the hopscotch-map based word-frequency counter (`freq`).

# CMAKE_CXX_STANDARD only accepts the value 17 starting with CMake 3.8.
cmake_minimum_required(VERSION 3.8)

project(freq)

# Select the language standard through CMake instead of a hand-written -std=
# flag. Mixing CMAKE_CXX_STANDARD 11 with "-std=c++17" in CMAKE_CXX_FLAGS puts
# two conflicting -std options on the compile line, and which one wins depends
# on their order.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Ask for -std=c++17 rather than -std=gnu++17, matching the flag used before.
set(CMAKE_CXX_EXTENSIONS OFF)

# Always optimise: this target exists to measure throughput.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")

set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR})

# hopscotch-map is vendored as a git submodule and exports tsl::hopscotch_map.
add_subdirectory(hopscotch-map)

add_executable(freq src/main.cpp)
target_link_libraries(freq PRIVATE tsl::hopscotch_map)
1 change: 1 addition & 0 deletions build/hopscotch/hopscotch-map
Submodule hopscotch-map added at 29030f
165 changes: 165 additions & 0 deletions build/hopscotch/src/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*

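Build with CMake from build/hopscotch, or compile directly from that directory:

clang++ -std=c++17 -O3 -Ihopscotch-map/include src/main.cpp -o freq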

*/

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

#include <tsl/hopscotch_map.h>
#include <tsl/hopscotch_set.h>

#include <fcntl.h> // open

// Platform glue: pulls in the mmap/lseek declarations for the current OS and
// defines MMAP_FLAGS, the flag set this program passes to mmap().
#if defined(_WIN32) || defined(WIN32)
// windows
// NOTE(review): ext/windows-mmap.h presumably supplies mmap-compatible
// declarations for Windows; confirm against that header.
#include "ext/windows-mmap.h"
#include <io.h>
#define lseek64 _lseeki64
// 4996 is MSVC's "deprecated / unsafe function" warning.
#pragma warning(disable : 4996)
#else
// unix
#include <sys/mman.h> // mmap
#include <unistd.h> // lseek
#endif

// macOS has no lseek64 symbol; map the name onto plain lseek.
#if defined(__APPLE__)
#define lseek64 lseek
#endif

// MAP_POPULATE is only requested when the installed kernel headers are newer
// than 2.6.22. The helper macro uses a project prefix because names starting
// with an underscore followed by an uppercase letter are reserved for the
// implementation.
#if defined(__linux__)
#include <linux/version.h>
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
#define FREQ_MAP_POPULATE_AVAILABLE
#endif
#endif

// Flags for mmap(): always a private mapping, pre-populated where available.
#ifdef FREQ_MAP_POPULATE_AVAILABLE
#define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
#else
#define MMAP_FLAGS MAP_PRIVATE
#endif

// Short aliases for common unsigned types.
typedef unsigned char byte;
typedef unsigned long long ullong;

// One output row: a word together with the number of times it was counted.
struct Word {
    std::string text;  // the word as it will be printed
    int occurances;    // how many times `text` was seen

    // Stores a copy of `s` as the text and `i` as its count.
    Word(const std::string &s, int i)
        : text(s),
          occurances(i) {
    }
};

/*
 * Counts case-insensitive ASCII word frequencies in a file.
 *
 * Usage: freq <input-file> <output-file>
 *
 * A word is a maximal run of the bytes [A-Za-z]; every other byte is a
 * separator. Words are lower-cased before counting. The output file receives
 * one "<count> <word>" line per distinct word, ordered by descending count
 * and, for equal counts, by ascending word so the result is deterministic.
 *
 * Returns 0 on success and 1 on a usage, I/O or mapping failure.
 */
int main(int argc, char **argv) {
    if (argc != 3) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "freq")
                  << " <input-file> <output-file>" << std::endl;
        return 1;
    }

    const int fd = open(argv[1], O_RDONLY);
    if (fd == -1) {
        std::cerr << "Can't open file" << std::endl;
        return 1;
    }

    FILE *out = fopen(argv[2], "wb+");
    if (!out) {
        std::cerr << "failed to open output file" << std::endl;
        return 1;
    }

    // Keep the signed result so the error value is tested before it is
    // converted to an unsigned size.
    const auto raw_size = lseek(fd, 0, SEEK_END);
    if (raw_size < 0) {
        std::cerr << "Failed to calculate file size" << std::endl;
        return 1;
    }
    const size_t fsz = static_cast<size_t>(raw_size);

    // A zero-length mmap() is an error, so an empty input is handled as an
    // empty byte range rather than being mapped.
    const uint8_t *begin = nullptr;
    if (fsz != 0) {
        begin = reinterpret_cast<const uint8_t *>(
            mmap(NULL, fsz, PROT_READ, MMAP_FLAGS, fd, 0));

        if (begin == nullptr || begin == MAP_FAILED) {
            std::cerr << "Unable to mmap" << std::endl;
            return 1;
        }

        std::cout << "file mmap'ed" << std::endl;
    }

    // letters[b] is the lower-case form of byte b when b is an ASCII letter
    // and 0 otherwise, so it works as both classifier and case folder.
    char letters[256] = {};
    for (int c = 'a'; c <= 'z'; ++c) {
        letters[c] = static_cast<char>(c);
    }
    for (int c = 'A'; c <= 'Z'; ++c) {
        letters[c] = static_cast<char>(c - 'A' + 'a');
    }

    std::cout << "letters: " << std::string(letters, 256) << std::endl;

    /*
     * Calculating the hash and comparing two std::string may be slow.
     * We can store the hash of each std::string in the hash map to make
     * the inserts and lookups faster by setting StoreHash to true.
     */
    tsl::hopscotch_map<std::string, int, std::hash<std::string>,
                       std::equal_to<std::string>,
                       std::allocator<std::pair<std::string, int>>,
                       30, true> map;

    const uint8_t *const end = begin + fsz;
    const uint8_t *word_start = nullptr;  // first byte of the word in progress
    std::string word;                     // reused buffer for the folded word

    // Walk one position past the last byte: reaching `end` is treated as a
    // separator so a word that runs up to end-of-file is still counted.
    for (const uint8_t *s = begin;; ++s) {
        const bool at_end = (s == end);

        if (!at_end && letters[*s]) {
            if (!word_start) {
                word_start = s;
            }
            continue;
        }

        if (word_start) {
            word.assign(reinterpret_cast<const char *>(word_start),
                        static_cast<size_t>(s - word_start));
            for (char &ch : word) {
                ch = letters[static_cast<unsigned char>(ch)];
            }
            // operator[] value-initialises a missing entry to 0, so a single
            // lookup covers both the first and every later occurrence.
            ++map[word];
            word_start = nullptr;
        }

        if (at_end) {
            break;
        }
    }

    std::vector<Word> words;
    words.reserve(map.size());

    for (const auto &key_value : map) {
        // Construct in place; the key is const, so the text is copied once.
        words.emplace_back(key_value.first, key_value.second);
    }

    std::sort(words.begin(), words.end(),
              [](const Word &a, const Word &b) -> bool {
                  if (a.occurances != b.occurances) {
                      return a.occurances > b.occurances;
                  }
                  // Tie-break on the text so the order does not depend on
                  // hash-map iteration order.
                  return a.text < b.text;
              });

    for (const auto &entry : words) {
        fprintf(out, "%d %s\n", entry.occurances, entry.text.c_str());
    }

    // Surface write errors instead of silently producing a truncated file.
    // fclose() also flushes, so its result has to be checked as well.
    const bool write_failed = ferror(out) != 0;
    const bool close_failed = fclose(out) != 0;
    if (write_failed || close_failed) {
        std::cerr << "failed to write output file" << std::endl;
        return 1;
    }

    // The mapping and the input descriptor are left for the OS to reclaim at
    // process exit.
    return 0;
}
25 changes: 24 additions & 1 deletion src/freq01.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,23 @@
#include <unistd.h>
#endif

#if __APPLE__
#define lseek64 lseek
#endif

#if __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
#define _MAP_POPULATE_AVAILABLE
#endif
#endif

#ifdef _MAP_POPULATE_AVAILABLE
#define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
#else
#define MMAP_FLAGS MAP_PRIVATE
#endif

constexpr uint32_t H = 2166136261;

int usage(char *process_name) {
Expand Down Expand Up @@ -88,8 +105,14 @@ int main(int argc, char **argv) {
}

const size_t fsz = lseek64(fd, 0, SEEK_END);

if (fsz == -1) {
std::cerr << "Failed to calculate file size" << std::endl;
exit(1);
}

const uint8_t *begin = reinterpret_cast<const uint8_t *>(
mmap(NULL, fsz, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0));
mmap(NULL, fsz, PROT_READ, MMAP_FLAGS, fd, 0));

char letters[256];
for (size_t i = 0; i < 256; ++i) {
Expand Down
19 changes: 18 additions & 1 deletion src/freq03.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@
#define __forceinline __attribute__((always_inline)) inline
#endif

#if __APPLE__
#define lseek64 lseek
#endif

#if __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
#define _MAP_POPULATE_AVAILABLE
#endif
#endif

#ifdef _MAP_POPULATE_AVAILABLE
#define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
#else
#define MMAP_FLAGS MAP_PRIVATE
#endif

typedef unsigned char byte;
typedef unsigned int uint;
typedef unsigned long long ullong;
Expand Down Expand Up @@ -184,7 +201,7 @@ int main(int argc, char **argv) {

const ullong fsz = lseek64(fd, 0, SEEK_END);
const byte *fbegin =
(const byte *)mmap(NULL, fsz, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
(const byte *)mmap(NULL, fsz, PROT_READ, MMAP_FLAGS, fd, 0);

static byte lc[256];
memset(lc, 0, sizeof(lc));
Expand Down