/* Generic infrastructure to implement various diff algorithms (implementation). */ /* * Copyright (c) 2020 Neels Hofmeyr * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "diff_internal.h" #include "diff_debug.h" inline enum diff_chunk_type diff_chunk_type(const struct diff_chunk *chunk) { if (!chunk->left_count && !chunk->right_count) return CHUNK_EMPTY; if (!chunk->solved) return CHUNK_ERROR; if (!chunk->right_count) return CHUNK_MINUS; if (!chunk->left_count) return CHUNK_PLUS; if (chunk->left_count != chunk->right_count) return CHUNK_ERROR; return CHUNK_SAME; } static int read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len) { int r; if (fseeko(f, at_pos, SEEK_SET) == -1) return errno; r = fread(buf, sizeof(char), len, f); if ((r == 0 || r < len) && ferror(f)) return EIO; if (r != len) return EIO; return 0; } static int buf_cmp(const unsigned char *left, size_t left_len, const unsigned char *right, size_t right_len, bool ignore_whitespace) { int cmp; if (ignore_whitespace) { int il = 0, ir = 0; while (il < left_len && ir < right_len) { unsigned char cl = left[il]; unsigned char cr = right[ir]; if (isspace((unsigned char)cl) && il < left_len) { il++; continue; } if (isspace((unsigned char)cr) && ir < right_len) { ir++; continue; } if (cl > cr) return 1; if (cr > cl) return -1; il++; ir++; } while (il < left_len) { unsigned char cl = left[il++]; if (!isspace((unsigned char)cl)) return 1; } while (ir < right_len) { unsigned char cr = right[ir++]; if (!isspace((unsigned char)cr)) return -1; } return 0; } cmp = memcmp(left, right, MIN(left_len, right_len)); if (cmp) return cmp; if (left_len == right_len) return 0; return (left_len > right_len) ? 1 : -1; } int diff_atom_cmp(int *cmp, const struct diff_atom *left, const struct diff_atom *right) { off_t remain_left, remain_right; int flags = (left->root->diff_flags | right->root->diff_flags); bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE); if (!left->len && !right->len) { *cmp = 0; return 0; } if (!ignore_whitespace) { if (!right->len) { *cmp = 1; return 0; } if (!left->len) { *cmp = -1; return 0; } } if (left->at != NULL && right->at != NULL) { *cmp = buf_cmp(left->at, left->len, right->at, right->len, ignore_whitespace); return 0; } remain_left = left->len; remain_right = right->len; while (remain_left > 0 || remain_right > 0) { const size_t chunksz = 8192; unsigned char buf_left[chunksz], buf_right[chunksz]; const uint8_t *p_left, *p_right; off_t n_left, n_right; ssize_t r; if (!remain_right) { *cmp = 1; return 0; } if (!remain_left) { *cmp = -1; return 0; } n_left = MIN(chunksz, remain_left); n_right = MIN(chunksz, remain_right); if (left->at == NULL) { r = read_at(left->root->f, left->pos + (left->len - remain_left), buf_left, n_left); if (r) { *cmp = 0; return r; } p_left = buf_left; } else { p_left = left->at + (left->len - remain_left); } if (right->at == NULL) { r = read_at(right->root->f, right->pos + (right->len - remain_right), buf_right, n_right); if (r) { *cmp = 0; return r; } p_right = buf_right; } else { p_right = right->at + (right->len - remain_right); } r = buf_cmp(p_left, n_left, p_right, n_right, ignore_whitespace); if (r) { *cmp = r; return 0; } remain_left -= n_left; remain_right -= n_right; } *cmp = 0; return 0; } int diff_atom_same(bool *same, const struct diff_atom *left, const struct diff_atom *right) { int cmp; int r; if (left->hash != right->hash) { *same = false; return 0; } r = diff_atom_cmp(&cmp, left, right); if (r) { *same = true; return r; } *same = (cmp == 0); return 0; } static struct diff_chunk * diff_state_add_solved_chunk(struct diff_state *state, const struct diff_chunk *chunk) { diff_chunk_arraylist_t *result; struct diff_chunk *new_chunk; enum diff_chunk_type last_t; enum diff_chunk_type new_t; struct diff_chunk *last; /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk * never directly follows a plus chunk. */ result = &state->result->chunks; last_t = result->len ? diff_chunk_type(&result->head[result->len - 1]) : CHUNK_EMPTY; new_t = diff_chunk_type(chunk); debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED", result->len); debug("L\n"); debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count); debug("R\n"); debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count); if (result->len) { last = &result->head[result->len - 1]; assert(chunk->left_start == last->left_start + last->left_count); assert(chunk->right_start == last->right_start + last->right_count); } if (new_t == last_t) { new_chunk = &result->head[result->len - 1]; new_chunk->left_count += chunk->left_count; new_chunk->right_count += chunk->right_count; debug(" - added chunk touches previous one of same type, joined:\n"); debug("L\n"); debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); debug("R\n"); debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) { enum diff_chunk_type prev_last_t = result->len > 1 ? diff_chunk_type(&result->head[result->len - 2]) : CHUNK_EMPTY; /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk-> * Is the one before that also a minus? combine. */ if (prev_last_t == CHUNK_MINUS) { new_chunk = &result->head[result->len - 2]; new_chunk->left_count += chunk->left_count; new_chunk->right_count += chunk->right_count; debug(" - added minus-chunk follows plus-chunk," " put before that plus-chunk and joined" " with preceding minus-chunk:\n"); debug("L\n"); debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); debug("R\n"); debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); } else { ARRAYLIST_INSERT(new_chunk, *result, result->len - 1); if (!new_chunk) return NULL; *new_chunk = *chunk; /* The new minus chunk indicates to which position on * the right it corresponds, even though it doesn't add * any lines on the right. By moving above a plus chunk, * that position on the right has shifted. */ last = &result->head[result->len - 1]; new_chunk->right_start = last->right_start; debug(" - added minus-chunk follows plus-chunk," " put before that plus-chunk\n"); } /* That last_t == CHUNK_PLUS indicates to which position on the * left it corresponds, even though it doesn't add any lines on * the left. By inserting/extending the prev_last_t == * CHUNK_MINUS, that position on the left has shifted. */ last = &result->head[result->len - 1]; last->left_start = new_chunk->left_start + new_chunk->left_count; } else { ARRAYLIST_ADD(new_chunk, *result); if (!new_chunk) return NULL; *new_chunk = *chunk; } return new_chunk; } /* Even if a left or right side is empty, diff output may need to know the * position in that file. * So left_start or right_start must never be NULL -- pass left_count or * right_count as zero to indicate staying at that position without consuming * any lines. */ struct diff_chunk * diff_state_add_chunk(struct diff_state *state, bool solved, struct diff_atom *left_start, unsigned int left_count, struct diff_atom *right_start, unsigned int right_count) { struct diff_chunk *new_chunk; struct diff_chunk chunk = { .solved = solved, .left_start = left_start, .left_count = left_count, .right_start = right_start, .right_count = right_count, }; /* An unsolved chunk means store as intermediate result for later * re-iteration. * If there already are intermediate results, that means even a * following solved chunk needs to go to intermediate results, so that * it is later put in the final correct position in solved chunks. */ if (!solved || state->temp_result.len) { /* Append to temp_result */ debug("ADD %s chunk to temp result:\n", chunk.solved ? "solved" : "UNSOLVED"); debug("L\n"); debug_dump_atoms(&state->left, left_start, left_count); debug("R\n"); debug_dump_atoms(&state->right, right_start, right_count); ARRAYLIST_ADD(new_chunk, state->temp_result); if (!new_chunk) return NULL; *new_chunk = chunk; return new_chunk; } return diff_state_add_solved_chunk(state, &chunk); } static void diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data, unsigned long long len, int diff_flags) { *d = (struct diff_data){ .f = f, .pos = 0, .data = data, .len = len, .root = d, .diff_flags = diff_flags, }; } void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent, struct diff_atom *from_atom, unsigned int atoms_count) { struct diff_atom *last_atom; debug("diff_data %p parent %p from_atom %p atoms_count %u\n", d, parent, from_atom, atoms_count); debug(" from_atom "); debug_dump_atom(parent, NULL, from_atom); if (atoms_count == 0) { *d = (struct diff_data){ .f = NULL, .pos = 0, .data = NULL, .len = 0, .root = parent->root, .atoms.head = NULL, .atoms.len = atoms_count, }; return; } last_atom = from_atom + atoms_count - 1; *d = (struct diff_data){ .f = NULL, .pos = from_atom->pos, .data = from_atom->at, .len = (last_atom->pos + last_atom->len) - from_atom->pos, .root = parent->root, .atoms.head = from_atom, .atoms.len = atoms_count, }; debug("subsection:\n"); debug_dump(d); } void diff_data_free(struct diff_data *diff_data) { if (!diff_data) return; if (diff_data->atoms.allocated) ARRAYLIST_FREE(diff_data->atoms); } int diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state) { debug("\n** %s\n", __func__); debug("left:\n"); debug_dump(&state->left); debug("right:\n"); debug_dump(&state->right); debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL, 0); /* Add a chunk of equal lines, if any */ struct diff_atom *l = state->left.atoms.head; unsigned int l_len = state->left.atoms.len; struct diff_atom *r = state->right.atoms.head; unsigned int r_len = state->right.atoms.len; unsigned int equal_atoms_start = 0; unsigned int equal_atoms_end = 0; unsigned int l_idx = 0; unsigned int r_idx = 0; while (equal_atoms_start < l_len && equal_atoms_start < r_len) { int err; bool same; err = diff_atom_same(&same, &l[equal_atoms_start], &r[equal_atoms_start]); if (err) return err; if (!same) break; equal_atoms_start++; } while (equal_atoms_end < (l_len - equal_atoms_start) && equal_atoms_end < (r_len - equal_atoms_start)) { int err; bool same; err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end], &r[r_len - 1 - equal_atoms_end]); if (err) return err; if (!same) break; equal_atoms_end++; } /* Add a chunk of equal lines at the start */ if (equal_atoms_start) { if (!diff_state_add_chunk(state, true, l, equal_atoms_start, r, equal_atoms_start)) return ENOMEM; l_idx += equal_atoms_start; r_idx += equal_atoms_start; } /* Add a "minus" chunk with all lines from the left. */ if (equal_atoms_start + equal_atoms_end < l_len) { unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end; if (!diff_state_add_chunk(state, true, &l[l_idx], add_len, &r[r_idx], 0)) return ENOMEM; l_idx += add_len; } /* Add a "plus" chunk with all lines from the right. */ if (equal_atoms_start + equal_atoms_end < r_len) { unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end; if (!diff_state_add_chunk(state, true, &l[l_idx], 0, &r[r_idx], add_len)) return ENOMEM; r_idx += add_len; } /* Add a chunk of equal lines at the end */ if (equal_atoms_end) { if (!diff_state_add_chunk(state, true, &l[l_idx], equal_atoms_end, &r[r_idx], equal_atoms_end)) return ENOMEM; } return DIFF_RC_OK; } static int diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state) { int rc; if (!algo_config || !algo_config->impl || !state->recursion_depth_left || !state->left.atoms.len || !state->right.atoms.len) { debug("Fall back to diff_algo_none():%s%s%s\n", (!algo_config || !algo_config->impl) ? " no-cfg" : "", (!state->recursion_depth_left) ? " max-depth" : "", (!state->left.atoms.len || !state->right.atoms.len)? " trivial" : ""); return diff_algo_none(algo_config, state); } ARRAYLIST_FREE(state->temp_result); ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE); rc = algo_config->impl(algo_config, state); switch (rc) { case DIFF_RC_USE_DIFF_ALGO_FALLBACK: debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo); rc = diff_run_algo(algo_config->fallback_algo, state); goto return_rc; case DIFF_RC_OK: /* continue below */ break; default: /* some error happened */ goto return_rc; } /* Pick up any diff chunks that are still unsolved and feed to * inner_algo. inner_algo will solve unsolved chunks and append to * result, and subsequent solved chunks on this level are then appended * to result afterwards. */ int i; for (i = 0; i < state->temp_result.len; i++) { struct diff_chunk *c = &state->temp_result.head[i]; if (c->solved) { diff_state_add_solved_chunk(state, c); continue; } /* c is an unsolved chunk, feed to inner_algo */ struct diff_state inner_state = { .result = state->result, .recursion_depth_left = state->recursion_depth_left - 1, .kd_buf = state->kd_buf, .kd_buf_size = state->kd_buf_size, }; diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count); diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count); rc = diff_run_algo(algo_config->inner_algo, &inner_state); state->kd_buf = inner_state.kd_buf; state->kd_buf_size = inner_state.kd_buf_size; if (rc != DIFF_RC_OK) goto return_rc; } rc = DIFF_RC_OK; return_rc: ARRAYLIST_FREE(state->temp_result); return rc; } int diff_atomize_file(struct diff_data *d, const struct diff_config *config, FILE *f, const uint8_t *data, off_t len, int diff_flags) { if (!config->atomize_func) return EINVAL; diff_data_init_root(d, f, data, len, diff_flags); return config->atomize_func(config->atomize_func_data, d); } struct diff_result * diff_main(const struct diff_config *config, struct diff_data *left, struct diff_data *right) { struct diff_result *result = malloc(sizeof(struct diff_result)); if (!result) return NULL; *result = (struct diff_result){}; result->left = left; result->right = right; struct diff_state state = { .result = result, .recursion_depth_left = config->max_recursion_depth ? config->max_recursion_depth : UINT_MAX, .kd_buf = NULL, .kd_buf_size = 0, }; diff_data_init_subsection(&state.left, left, left->atoms.head, left->atoms.len); diff_data_init_subsection(&state.right, right, right->atoms.head, right->atoms.len); result->rc = diff_run_algo(config->algo, &state); free(state.kd_buf); return result; } void diff_result_free(struct diff_result *result) { if (!result) return; ARRAYLIST_FREE(result->chunks); free(result); } int diff_result_contains_printable_chunks(struct diff_result *result) { struct diff_chunk *c; enum diff_chunk_type t; int i; for (i = 0; i < result->chunks.len; i++) { c = &result->chunks.head[i]; t = diff_chunk_type(c); if (t == CHUNK_MINUS || t == CHUNK_PLUS) return 1; } return 0; }