Skip to main content
aboutsummaryrefslogtreecommitdiffstats
blob: d0e8b887cefdb1b0ed4580b4fb1bc111cb990795 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
/*
 * Copyright (C) 2010, Google Inc.
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.diff;

import org.eclipse.jgit.internal.JGitText;

/**
 * Support {@link HistogramDiff} by computing occurrence counts of elements.
 *
 * Each element in the range being considered is put into a hash table, tracking
 * the number of times that distinct element appears in the sequence. Once all
 * elements have been inserted from sequence A, each element of sequence B is
 * probed in the hash table and the longest common subsequence with the lowest
 * occurrence count in A is used as the result.
 *
 * @param <S>
 *            type of the base sequence.
 */
final class HistogramDiffIndex<S extends Sequence> {
	private static final int REC_NEXT_SHIFT = 28 + 8;

	private static final int REC_PTR_SHIFT = 8;

	private static final int REC_PTR_MASK = (1 << 28) - 1;

	private static final int REC_CNT_MASK = (1 << 8) - 1;

	private static final int MAX_PTR = REC_PTR_MASK;

	private static final int MAX_CNT = (1 << 8) - 1;

	private final int maxChainLength;

	private final HashedSequenceComparator<S> cmp;

	private final HashedSequence<S> a;

	private final HashedSequence<S> b;

	private final Edit region;

	/** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
	private final int[] table;

	/** Number of low bits to discard from a key to index {@link #table}. */
	private final int keyShift;

	/**
	 * Describes a unique element in sequence A.
	 *
	 * The records in this table are actually 3-tuples of:
	 * <ul>
	 * <li>index of next record in this table that has same hash code</li>
	 * <li>index of first element in this occurrence chain</li>
	 * <li>occurrence count for this element (length of locs list)</li>
	 * </ul>
	 *
	 * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
	 * a few bits wide. Elements that occur more frequently will have their
	 * count capped.
	 */
	private long[] recs;

	/** Number of elements in {@link #recs}; also is the unique element count. */
	private int recCnt;

	/**
	 * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
	 *
	 * For the sequence element {@code ptr}, the value stored at location
	 * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
	 * element in the sequence.
	 *
	 * Chains always run from the lowest index to the largest index. Therefore
	 * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
	 * This allows a chain to terminate with {@code 0}, as {@code 0} would never
	 * be a valid next element.
	 *
	 * The array is sized to be {@code region.getLengthA()} and element indexes
	 * are converted to array indexes by subtracting {@link #ptrShift}, which is
	 * just a cached version of {@code region.beginA}.
	 */
	private int[] next;

	/**
	 * For element {@code ptr} in A, index of the record in {@link #recs} array.
	 *
	 * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
	 * describing all occurrences of the element appearing in sequence A at
	 * position {@code ptr}. The record is needed to get the occurrence count of
	 * the element, or to locate all other occurrences of that element within
	 * sequence A. This index provides constant-time access to the record, and
	 * avoids needing to scan the hash chain.
	 */
	private int[] recIdx;

	/** Value to subtract from element indexes to key {@link #next} array. */
	private int ptrShift;

	private Edit lcs;

	private int cnt;

	private boolean hasCommon;

	HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
			HashedSequence<S> a, HashedSequence<S> b, Edit r) {
		this.maxChainLength = maxChainLength;
		this.cmp = cmp;
		this.a = a;
		this.b = b;
		this.region = r;

		if (region.endA >= MAX_PTR)
			throw new IllegalArgumentException(
					JGitText.get().sequenceTooLargeForDiffAlgorithm);

		final int sz = r.getLengthA();
		final int tableBits = tableBits(sz);
		table = new int[1 << tableBits];
		keyShift = 32 - tableBits;
		ptrShift = r.beginA;

		recs = new long[Math.max(4, sz >>> 3)];
		next = new int[sz];
		recIdx = new int[sz];
	}

	Edit findLongestCommonSequence() {
		if (!scanA())
			return null;

		lcs = new Edit(0, 0);
		cnt = maxChainLength + 1;

		for (int bPtr = region.beginB; bPtr < region.endB;)
			bPtr = tryLongestCommonSequence(bPtr);

		return hasCommon && maxChainLength < cnt ? null : lcs;
	}

	private boolean scanA() {
		// Scan the elements backwards, inserting them into the hash table
		// as we go. Going in reverse places the earliest occurrence of any
		// element at the start of the chain, so we consider earlier matches
		// before later matches.
		//
		SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
			final int tIdx = hash(a, ptr);

			int chainLen = 0;
			for (int rIdx = table[tIdx]; rIdx != 0;) {
				final long rec = recs[rIdx];
				if (cmp.equals(a, recPtr(rec), a, ptr)) {
					// ptr is identical to another element. Insert it onto
					// the front of the existing element chain.
					//
					int newCnt = recCnt(rec) + 1;
					if (MAX_CNT < newCnt)
						newCnt = MAX_CNT;
					recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
					next[ptr - ptrShift] = recPtr(rec);
					recIdx[ptr - ptrShift] = rIdx;
					continue SCAN;
				}

				rIdx = recNext(rec);
				chainLen++;
			}

			if (chainLen == maxChainLength)
				return false;

			// This is the first time we have ever seen this particular
			// element in the sequence. Construct a new chain for it.
			//
			final int rIdx = ++recCnt;
			if (rIdx == recs.length) {
				int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
				long[] n = new long[sz];
				System.arraycopy(recs, 0, n, 0, recs.length);
				recs = n;
			}

			recs[rIdx] = recCreate(table[tIdx], ptr, 1);
			recIdx[ptr - ptrShift] = rIdx;
			table[tIdx] = rIdx;
		}
		return true;
	}

	private int tryLongestCommonSequence(final int bPtr) {
		int bNext = bPtr + 1;
		int rIdx = table[hash(b, bPtr)];
		for (long rec; rIdx != 0; rIdx = recNext(rec)) {
			rec = recs[rIdx];

			// If there are more occurrences in A, don't use this chain.
			if (recCnt(rec) > cnt) {
				if (!hasCommon)
					hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
				continue;
			}

			int as = recPtr(rec);
			if (!cmp.equals(a, as, b, bPtr))
				continue;

			hasCommon = true;
			TRY_LOCATIONS: for (;;) {
				int np = next[as - ptrShift];
				int bs = bPtr;
				int ae = as + 1;
				int be = bs + 1;
				int rc = recCnt(rec);

				while (region.beginA < as && region.beginB < bs
						&& cmp.equals(a, as - 1, b, bs - 1)) {
					as--;
					bs--;
					if (1 < rc)
						rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
				}
				while (ae < region.endA && be < region.endB
						&& cmp.equals(a, ae, b, be)) {
					if (1 < rc)
						rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
					ae++;
					be++;
				}

				if (bNext < be)
					bNext = be;
				if (lcs.getLengthA() < ae - as || rc < cnt) {
					// If this region is the longest, or there are less
					// occurrences of it in A, its now our LCS.
					//
					lcs.beginA = as;
					lcs.beginB = bs;
					lcs.endA = ae;
					lcs.endB = be;
					cnt = rc;
				}

				// Because we added elements in reverse order index 0
				// cannot possibly be the next position. Its the first
				// element of the sequence and thus would have been the
				// value of as at the start of the TRY_LOCATIONS loop.
				//
				if (np == 0)
					break TRY_LOCATIONS;

				while (np < ae) {
					// The next location to consider was actually within
					// the LCS we examined above. Don't reconsider it.
					//
					np = next[np - ptrShift];
					if (np == 0)
						break TRY_LOCATIONS;
				}

				as = np;
			}
		}
		return bNext;
	}

	private int hash(HashedSequence<S> s, int idx) {
		return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
	}

	private static long recCreate(int next, int ptr, int cnt) {
		return ((long) next << REC_NEXT_SHIFT) //
				| ((long) ptr << REC_PTR_SHIFT) //
				| cnt;
	}

	private static int recNext(long rec) {
		return (int) (rec >>> REC_NEXT_SHIFT);
	}

	private static int recPtr(long rec) {
		return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
	}

	private static int recCnt(long rec) {
		return ((int) rec) & REC_CNT_MASK;
	}

	private static int tableBits(final int sz) {
		int bits = 31 - Integer.numberOfLeadingZeros(sz);
		if (bits == 0)
			bits = 1;
		if (1 << bits < sz)
			bits++;
		return bits;
	}
}

Back to the top