diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a7e4ed..9e2f4da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,3 +34,8 @@ add_executable(test_many_commits tests/test_many_commits.c)
 target_link_libraries(test_many_commits bup_odb ${LIBGIT2_LIBRARIES})
 add_test(NAME test_many_commits COMMAND test_many_commits)
 set_tests_properties(test_many_commits PROPERTIES WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_executable(test_repack_fsck tests/test_repack_fsck.c)
+target_link_libraries(test_repack_fsck bup_odb ${LIBGIT2_LIBRARIES})
+add_test(NAME test_repack_fsck COMMAND test_repack_fsck)
+set_tests_properties(test_repack_fsck PROPERTIES WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
diff --git a/src/git2.c b/src/git2.c
index 7b82cd4..d2a9da4 100644
--- a/src/git2.c
+++ b/src/git2.c
@@ -6,6 +6,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
 static int cmd_hash_object(const char *file)
 {
@@ -257,6 +260,325 @@ static int cmd_commit(const char *repo_path, const char *message)
     return ret;
 }
 
+/*
+ * Load every object below `tree`, recursing into sub-trees.
+ *
+ * Returns 0 when each entry could be looked up, or the negative libgit2
+ * error code of the first entry that could not.
+ */
+static int walk_tree(git_repository *repo, git_tree *tree)
+{
+    size_t count = git_tree_entrycount(tree);
+    for (size_t i = 0; i < count; i++) {
+        const git_tree_entry *entry = git_tree_entry_byindex(tree, i);
+        git_object *obj = NULL;
+        int ret = git_tree_entry_to_object(&obj, repo, entry);
+        if (ret < 0)
+            return ret;
+        if (git_object_type(obj) == GIT_OBJECT_TREE)
+            ret = walk_tree(repo, (git_tree *)obj);
+        git_object_free(obj);
+        if (ret < 0)
+            return ret;
+    }
+    return 0;
+}
+
+/*
+ * Seed `walk` with HEAD.
+ *
+ * An unborn HEAD (a repository without commits) is not an error: the walk
+ * is left empty.  Returns 0 on success or a negative libgit2 error code.
+ */
+static int push_head(git_repository *repo, git_revwalk *walk)
+{
+    int unborn = git_repository_head_unborn(repo);
+    if (unborn < 0)
+        return unborn;
+    return unborn ? 0 : git_revwalk_push_head(walk);
+}
+
+/*
+ * Check that every commit reachable from HEAD, and every tree entry below
+ * those commits, can be looked up in the repository at `repo_path`.
+ *
+ * Returns 0 on success or a negative libgit2 error code on the first failure.
+ */
+static int cmd_fsck(const char *repo_path)
+{
+    git_repository *repo = NULL;
+    int ret = git_repository_open(&repo, repo_path);
+    if (ret < 0)
+        return ret;
+
+    git_revwalk *walk = NULL;
+    ret = git_revwalk_new(&walk, repo);
+    if (ret < 0)
+        goto out;
+    ret = push_head(repo, walk);
+    if (ret < 0)
+        goto out;
+
+    git_oid oid;
+    while ((ret = git_revwalk_next(&oid, walk)) == 0) {
+        git_commit *commit = NULL;
+        ret = git_commit_lookup(&commit, repo, &oid);
+        if (ret < 0)
+            break;
+        git_tree *tree = NULL;
+        ret = git_commit_tree(&tree, commit);
+        if (ret == 0) {
+            ret = walk_tree(repo, tree);
+            git_tree_free(tree);
+        }
+        git_commit_free(commit);
+        if (ret < 0)
+            break;
+    }
+
+    /* GIT_ITEROVER only means the walk ran out of commits. */
+    if (ret == GIT_ITEROVER)
+        ret = 0;
+
+out:
+    git_revwalk_free(walk);
+    git_repository_free(repo);
+    return ret;
+}
+
+/* Growable array of object ids without duplicates. */
+typedef struct {
+    git_oid *oids;
+    size_t count;
+    size_t cap;
+} oid_list;
+
+/* Return 1 when `oid` is stored in `list`, 0 otherwise. */
+static int oid_list_contains(const oid_list *list, const git_oid *oid)
+{
+    for (size_t i = 0; i < list->count; i++) {
+        if (git_oid_cmp(&list->oids[i], oid) == 0)
+            return 1;
+    }
+    return 0;
+}
+
+/*
+ * Append `oid` to `list` unless it is already present.
+ *
+ * Returns 0 on success (including duplicates) or -1 when growing the
+ * array fails; the list is left unchanged in that case.
+ */
+static int oid_list_add(oid_list *list, const git_oid *oid)
+{
+    if (oid_list_contains(list, oid))
+        return 0;
+    if (list->count == list->cap) {
+        size_t new_cap = list->cap ? list->cap * 2 : 32;
+        git_oid *tmp = realloc(list->oids, new_cap * sizeof(*tmp));
+        if (!tmp)
+            return -1;
+        list->oids = tmp;
+        list->cap = new_cap;
+    }
+    git_oid_cpy(&list->oids[list->count++], oid);
+    return 0;
+}
+
+/*
+ * Record the id of every entry below `tree` in `list`, recursing into
+ * sub-trees.  The id of `tree` itself is not added.
+ *
+ * Returns 0 on success or a negative value on lookup or allocation failure.
+ */
+static int collect_tree_oids(git_repository *repo, git_tree *tree, oid_list *list)
+{
+    size_t count = git_tree_entrycount(tree);
+    for (size_t i = 0; i < count; i++) {
+        const git_tree_entry *entry = git_tree_entry_byindex(tree, i);
+        if (oid_list_add(list, git_tree_entry_id(entry)) < 0)
+            return -1;
+        if (git_tree_entry_type(entry) != GIT_OBJECT_TREE)
+            continue;
+        git_object *obj = NULL;
+        if (git_tree_entry_to_object(&obj, repo, entry) < 0)
+            return -1;
+        int ret = collect_tree_oids(repo, (git_tree *)obj, list);
+        git_object_free(obj);
+        if (ret < 0)
+            return ret;
+    }
+    return 0;
+}
+
+/*
+ * Fill `list` with the ids of all commits reachable from HEAD together
+ * with their root trees and everything below them.
+ *
+ * Returns 0 on success or a negative value on failure; `list` may be
+ * partially filled in that case and is still owned by the caller.
+ */
+static int collect_reachable_oids(git_repository *repo, oid_list *list)
+{
+    git_revwalk *walk = NULL;
+    int ret = git_revwalk_new(&walk, repo);
+    if (ret < 0)
+        return ret;
+    ret = push_head(repo, walk);
+    if (ret < 0)
+        goto out;
+
+    git_oid oid;
+    while ((ret = git_revwalk_next(&oid, walk)) == 0) {
+        git_commit *commit = NULL;
+        ret = oid_list_add(list, &oid);
+        if (ret < 0)
+            break;
+        ret = git_commit_lookup(&commit, repo, &oid);
+        if (ret < 0)
+            break;
+        /* The root tree is packed too, so its loose copy is redundant. */
+        ret = oid_list_add(list, git_commit_tree_id(commit));
+        git_tree *tree = NULL;
+        if (ret == 0)
+            ret = git_commit_tree(&tree, commit);
+        if (ret == 0) {
+            ret = collect_tree_oids(repo, tree, list);
+            git_tree_free(tree);
+        }
+        git_commit_free(commit);
+        if (ret < 0)
+            break;
+    }
+    if (ret == GIT_ITEROVER)
+        ret = 0;
+
+out:
+    git_revwalk_free(walk);
+    return ret;
+}
+
+enum { OID_HEX_LEN = 40, FANOUT_LEN = 2 };
+
+/* Return 1 when `s` consists of exactly `len` hex digits, 0 otherwise. */
+static int is_hex_name(const char *s, size_t len)
+{
+    if (strlen(s) != len)
+        return 0;
+    for (size_t i = 0; i < len; i++) {
+        char c = s[i];
+        if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+              (c >= 'A' && c <= 'F')))
+            return 0;
+    }
+    return 1;
+}
+
+/*
+ * Delete the loose copy of every object reachable from HEAD.
+ *
+ * Meant to run after a successful repack, when those objects also live in
+ * the new pack.  Loose objects that are not reachable were not packed and
+ * are left in place.  Best effort: any failure just leaves files behind.
+ */
+static void remove_loose_objects(const char *repo_path)
+{
+    git_repository *repo = NULL;
+    if (git_repository_open(&repo, repo_path) < 0)
+        return;
+
+    oid_list packed = {0};
+    int ret = collect_reachable_oids(repo, &packed);
+    git_repository_free(repo);
+    if (ret < 0)
+        goto out;
+
+    char objdir[512];
+    int n = snprintf(objdir, sizeof(objdir), "%s/.git/objects", repo_path);
+    if (n < 0 || (size_t)n >= sizeof(objdir))
+        goto out;
+    DIR *d = opendir(objdir);
+    if (!d)
+        goto out;
+
+    struct dirent *ent;
+    while ((ent = readdir(d))) {
+        /* Object directories are two hex digits; this also skips "..". */
+        if (!is_hex_name(ent->d_name, FANOUT_LEN))
+            continue;
+        char path[512];
+        n = snprintf(path, sizeof(path), "%s/%s", objdir, ent->d_name);
+        if (n < 0 || (size_t)n >= sizeof(path))
+            continue;
+        DIR *sd = opendir(path);
+        if (!sd)
+            continue;
+        struct dirent *ent2;
+        while ((ent2 = readdir(sd))) {
+            /* Anything but the remaining 38 hex digits is not an object. */
+            if (!is_hex_name(ent2->d_name, OID_HEX_LEN - FANOUT_LEN))
+                continue;
+            char hex[OID_HEX_LEN + 1];
+            snprintf(hex, sizeof(hex), "%s%s", ent->d_name, ent2->d_name);
+            git_oid oid;
+            if (git_oid_fromstr(&oid, hex) < 0)
+                continue;
+            if (!oid_list_contains(&packed, &oid))
+                continue;
+            char file[512];
+            n = snprintf(file, sizeof(file), "%s/%s", path, ent2->d_name);
+            if (n >= 0 && (size_t)n < sizeof(file))
+                unlink(file);
+        }
+        closedir(sd);
+        rmdir(path); /* fails harmlessly while the directory has entries */
+    }
+    closedir(d);
+
+out:
+    free(packed.oids);
+}
+
+/*
+ * Pack every object reachable from HEAD into a new pack file, then remove
+ * the loose copies of the packed objects.
+ *
+ * Returns 0 on success or a negative libgit2 error code.
+ */
+static int cmd_repack(const char *repo_path)
+{
+    git_repository *repo = NULL;
+    int ret = git_repository_open(&repo, repo_path);
+    if (ret < 0)
+        return ret;
+
+    git_packbuilder *pb = NULL;
+    git_revwalk *walk = NULL;
+    ret = git_packbuilder_new(&pb, repo);
+    if (ret < 0)
+        goto out;
+    ret = git_revwalk_new(&walk, repo);
+    if (ret < 0)
+        goto out;
+    ret = push_head(repo, walk);
+    if (ret < 0)
+        goto out;
+    ret = git_packbuilder_insert_walk(pb, walk);
+    if (ret < 0)
+        goto out;
+    /* A NULL path selects the repository's default pack directory. */
+    ret = git_packbuilder_write(pb, NULL, 0, NULL, NULL);
+
+out:
+    git_revwalk_free(walk);
+    git_packbuilder_free(pb);
+    git_repository_free(repo);
+    /* Only drop loose copies once the pack was written successfully. */
+    if (ret == 0)
+        remove_loose_objects(repo_path);
+    return ret;
+}
+
 int main(int argc, char **argv)
 {
     git_libgit2_init();
@@ -320,6 +642,20 @@ int main(int argc, char **argv)
         } else {
             ret = cmd_show(repo_path, argv[arg]);
         }
+    } else if (strcmp(cmd, "repack") == 0) {
+        if (!repo_path) {
+            fprintf(stderr, "repack requires -C <repo>\n");
+            ret = 1;
+        } else {
+            ret = cmd_repack(repo_path);
+        }
+    } else if (strcmp(cmd, "fsck") == 0) {
+        if (!repo_path) {
+            fprintf(stderr, "fsck requires -C <repo>\n");
+            ret = 1;
+        } else {
+            ret = cmd_fsck(repo_path);
+        }
     } else {
         fprintf(stderr, "Unknown command %s\n", cmd);
         ret = 1;
diff --git a/tests/test_repack_fsck.c b/tests/test_repack_fsck.c
new file mode 100644
index 0000000..ae2b51d
--- /dev/null
+++ b/tests/test_repack_fsck.c
@@ -0,0 +1,315 @@
+#include "bup_odb.h"
+#include <ctype.h>
+#include <dirent.h>
+#include <git2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#define FILE_SIZE 100000
+#define CHANGE_BLOCK 10
+#define NUM_VERSIONS 100
+#define REPO_TEMPLATE "repack_repoXXXXXX"
+#define FILE_NAME "file.bin"
+
+/*
+ * Like assert(), but never compiled out.  The checked expressions in this
+ * test have side effects, so they must still run when NDEBUG is defined.
+ */
+#define CHECK(cond)                                                       \
+    do {                                                                  \
+        if (!(cond)) {                                                    \
+            fprintf(stderr, "%s:%d: check failed: %s\n", __FILE__,        \
+                    __LINE__, #cond);                                     \
+            abort();                                                      \
+        }                                                                 \
+    } while (0)
+
+/* Path of the CLI under test, relative to the test's working directory. */
+static const char *detect_cli(void)
+{
+    return "./git2";
+}
+
+/* Fill `buf` with `len` pseudo-random bytes from rand(). */
+static void fill_random(char *buf, size_t len)
+{
+    for (size_t i = 0; i < len; i++)
+        buf[i] = (char)(rand() % 256);
+}
+
+/* Replace the contents of `path` with the `len` bytes at `data`. */
+static void write_file(const char *path, const char *data, size_t len)
+{
+    FILE *f = fopen(path, "wb");
+    CHECK(f);
+    CHECK(fwrite(data, 1, len, f) == len);
+    CHECK(fclose(f) == 0);
+}
+
+/* Stage FILE_NAME in `repo` and commit it with message `msg`. */
+static void commit_file(const char *cli, const char *repo, const char *msg)
+{
+    char cmd[512];
+    snprintf(cmd, sizeof(cmd), "%s -C %s add %s", cli, repo, FILE_NAME);
+    CHECK(system(cmd) == 0);
+    snprintf(cmd, sizeof(cmd), "%s -C %s commit -m '%s'", cli, repo, msg);
+    CHECK(system(cmd) == 0);
+}
+
+/* Check that `show <spec>` prints exactly the `len` bytes at `data`. */
+static void verify_blob(const char *cli, const char *repo, const char *spec,
+                        const char *data, size_t len)
+{
+    char cmd[512];
+    snprintf(cmd, sizeof(cmd), "%s -C %s show %s", cli, repo, spec);
+    FILE *p = popen(cmd, "r");
+    CHECK(p);
+    char *buf = malloc(len);
+    CHECK(buf);
+    CHECK(fread(buf, 1, len, p) == len);
+    CHECK(fgetc(p) == EOF);
+    pclose(p);
+    CHECK(memcmp(buf, data, len) == 0);
+    free(buf);
+}
+
+/*
+ * Write `data` as a blob through `backend` and return what
+ * bup_backend_object_chunk_count() reports for it; `chunks` and `lens`
+ * are passed through to that call and freed by the caller afterwards.
+ */
+static size_t store_blob_get_chunks(git_odb_backend *backend, const void *data,
+                                    size_t len, git_oid *oid, git_oid **chunks,
+                                    size_t **lens)
+{
+    CHECK(backend->write(backend, oid, data, len, GIT_OBJECT_BLOB) == 0);
+    return bup_backend_object_chunk_count(backend, oid, chunks, lens);
+}
+
+/* Count the entries of `new_chunks` that also occur in `old_chunks`. */
+static size_t count_reused(const git_oid *new_chunks, size_t new_count,
+                           const git_oid *old_chunks, size_t old_count)
+{
+    size_t reused = 0;
+    for (size_t i = 0; i < new_count; i++) {
+        for (size_t j = 0; j < old_count; j++) {
+            if (git_oid_cmp(&new_chunks[i], &old_chunks[j]) == 0) {
+                reused++;
+                break;
+            }
+        }
+    }
+    return reused;
+}
+
+/* Return 1 for the directory entries "." and "..", 0 otherwise. */
+static int is_dot_entry(const char *name)
+{
+    return strcmp(name, ".") == 0 || strcmp(name, "..") == 0;
+}
+
+/*
+ * Sum of st_size over every non-directory entry at or below `path`.
+ * Entries that cannot be stat'ed or opened count as 0.
+ */
+static long long dir_size(const char *path)
+{
+    struct stat st;
+    if (lstat(path, &st) < 0)
+        return 0;
+    if (!S_ISDIR(st.st_mode))
+        return st.st_size;
+    DIR *d = opendir(path);
+    if (!d)
+        return 0;
+    long long sum = 0;
+    struct dirent *ent;
+    while ((ent = readdir(d))) {
+        if (is_dot_entry(ent->d_name))
+            continue;
+        char buf[512];
+        snprintf(buf, sizeof(buf), "%s/%s", path, ent->d_name);
+        sum += dir_size(buf);
+    }
+    closedir(d);
+    return sum;
+}
+
+/* Count the "*.pack" files in the pack directory of `repo`. */
+static size_t count_pack_files(const char *repo)
+{
+    char path[512];
+    snprintf(path, sizeof(path), "%s/.git/objects/pack", repo);
+    DIR *d = opendir(path);
+    if (!d)
+        return 0;
+    size_t count = 0;
+    struct dirent *ent;
+    while ((ent = readdir(d))) {
+        /* Match the suffix only, so "x.pack.tmp" is not counted. */
+        size_t len = strlen(ent->d_name);
+        if (len > 5 && strcmp(ent->d_name + len - 5, ".pack") == 0)
+            count++;
+    }
+    closedir(d);
+    return count;
+}
+
+/* Count the files in the two-hex-digit object directories of `repo`. */
+static size_t count_loose_objects(const char *repo)
+{
+    char objpath[512];
+    snprintf(objpath, sizeof(objpath), "%s/.git/objects", repo);
+    DIR *d = opendir(objpath);
+    if (!d)
+        return 0;
+    size_t count = 0;
+    struct dirent *ent;
+    while ((ent = readdir(d))) {
+        const char *name = ent->d_name;
+        /* A plain length check would also descend into "..". */
+        if (!isxdigit((unsigned char)name[0]) ||
+            !isxdigit((unsigned char)name[1]) || name[2] != '\0')
+            continue;
+        char subdir[512];
+        snprintf(subdir, sizeof(subdir), "%s/%s", objpath, name);
+        DIR *sd = opendir(subdir);
+        if (!sd)
+            continue;
+        struct dirent *ent2;
+        while ((ent2 = readdir(sd))) {
+            if (!is_dot_entry(ent2->d_name))
+                count++;
+        }
+        closedir(sd);
+    }
+    closedir(d);
+    return count;
+}
+
+int main(void)
+{
+    git_libgit2_init();
+    srand(1234);
+
+    const char *cli = detect_cli();
+    git_odb_backend *backend = NULL;
+
+    char repo_tmp[] = REPO_TEMPLATE;
+    char *repo = mkdtemp(repo_tmp);
+    CHECK(repo);
+
+    char cmd[512];
+    snprintf(cmd, sizeof(cmd), "%s init %s", cli, repo);
+    CHECK(system(cmd) == 0);
+
+    CHECK(bup_odb_backend_new(&backend, repo) == 0);
+
+    setenv("GIT_AUTHOR_NAME", "Tester", 1);
+    setenv("GIT_AUTHOR_EMAIL", "tester@example.com", 1);
+    setenv("GIT_COMMITTER_NAME", "Tester", 1);
+    setenv("GIT_COMMITTER_EMAIL", "tester@example.com", 1);
+
+    char filepath[512];
+    snprintf(filepath, sizeof(filepath), "%s/%s", repo, FILE_NAME);
+
+    char *versions[NUM_VERSIONS];
+    char *data = malloc(FILE_SIZE);
+    CHECK(data);
+    fill_random(data, FILE_SIZE);
+
+    git_oid *chunks = NULL;
+    size_t *lens = NULL;
+    git_oid oid;
+
+    write_file(filepath, data, FILE_SIZE);
+    commit_file(cli, repo, "ver 0");
+    versions[0] = malloc(FILE_SIZE);
+    CHECK(versions[0]);
+    memcpy(versions[0], data, FILE_SIZE);
+    size_t chunk_count =
+        store_blob_get_chunks(backend, data, FILE_SIZE, &oid, &chunks, &lens);
+    long long git_size = dir_size(repo);
+    printf("initial reused=%zu unique=%zu git_size=%lld\n", chunk_count,
+           (size_t)0, git_size);
+
+    for (int i = 1; i < NUM_VERSIONS; i++) {
+        size_t off = rand() % (FILE_SIZE - CHANGE_BLOCK + 1);
+        fill_random(data + off, CHANGE_BLOCK);
+        write_file(filepath, data, FILE_SIZE);
+
+        char msg[64];
+        snprintf(msg, sizeof(msg), "ver %d", i);
+        commit_file(cli, repo, msg);
+
+        git_oid *new_chunks = NULL;
+        size_t *new_lens = NULL;
+        git_oid new_oid;
+        size_t new_count = store_blob_get_chunks(backend, data, FILE_SIZE,
+                                                 &new_oid, &new_chunks,
+                                                 &new_lens);
+        size_t reused =
+            count_reused(new_chunks, new_count, chunks, chunk_count);
+        size_t unique = new_count - reused;
+        git_size = dir_size(repo);
+        printf("iter=%d reused=%zu unique=%zu git_size=%lld\n", i, reused,
+               unique, git_size);
+
+        free(chunks);
+        free(lens);
+        chunks = new_chunks;
+        lens = new_lens;
+        chunk_count = new_count;
+
+        versions[i] = malloc(FILE_SIZE);
+        CHECK(versions[i]);
+        memcpy(versions[i], data, FILE_SIZE);
+    }
+
+    long long size_before = dir_size(repo);
+    size_t pack_before = count_pack_files(repo);
+    size_t loose_before = count_loose_objects(repo);
+    printf("size_before_pack=%lld pack_files_before=%zu loose_before=%zu\n",
+           size_before, pack_before, loose_before);
+
+    /* system() returns only after repack has finished, so the counts
+     * below can be taken right away without polling. */
+    snprintf(cmd, sizeof(cmd), "%s -C %s repack", cli, repo);
+    CHECK(system(cmd) == 0);
+    long long size_after = dir_size(repo);
+    size_t pack_after = count_pack_files(repo);
+    size_t loose_after = count_loose_objects(repo);
+    printf("size_after_pack=%lld pack_files_after=%zu loose_after=%zu\n",
+           size_after, pack_after, loose_after);
+    CHECK(pack_before == 0);
+    CHECK(pack_after == 1);
+    CHECK(loose_before > 0);
+    CHECK(loose_after < loose_before);
+    snprintf(cmd, sizeof(cmd), "%s -C %s fsck", cli, repo);
+    CHECK(system(cmd) == 0);
+
+    /* versions[i] is the file as of commit i, i.e. HEAD~(NUM_VERSIONS-1-i). */
+    for (int i = 0; i < NUM_VERSIONS; i++) {
+        int rev = NUM_VERSIONS - 1 - i;
+        char spec[64];
+        if (rev == 0)
+            snprintf(spec, sizeof(spec), "HEAD:%s", FILE_NAME);
+        else
+            snprintf(spec, sizeof(spec), "HEAD~%d:%s", rev, FILE_NAME);
+        verify_blob(cli, repo, spec, versions[i], FILE_SIZE);
+        free(versions[i]);
+    }
+
+    free(chunks);
+    free(lens);
+    backend->free(backend);
+
+    snprintf(cmd, sizeof(cmd), "rm -rf %s", repo);
+    system(cmd);
+    free(data);
+    git_libgit2_shutdown();
+    return 0;
+}