/*
 * Copyright (C) 2010, Google Inc. and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.diff;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectStream;

/**
 * Index structure of lines/blocks in one file.
 * <p>
 * This structure can be used to compute an approximation of the similarity
 * between two files. The index is used by
 * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
 * between files.
 * <p>
 * To save space in memory, this index uses a space efficient encoding which
 * will not exceed 1 MiB per instance. The index starts out at a smaller size
 * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
 * file are discovered.
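 * <p>
 * A minimal usage sketch; {@code loaderA} and {@code loaderB} are
 * placeholders for {@link org.eclipse.jgit.lib.ObjectLoader} instances
 * obtained elsewhere (for example from an object reader):
 *
 * <pre>{@code
 * SimilarityIndex a = SimilarityIndex.create(loaderA);
 * SimilarityIndex b = SimilarityIndex.create(loaderB);
 * int score = a.score(b, 100); // similarity in the range [0, 100]
 * }</pre>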
 *
 * @since 4.0
 */
public class SimilarityIndex {
	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
	public static final TableFullException
			TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

	/**
	 * Shift to apply before storing a key.
	 * <p>
	 * Within the 64 bit table record space, we leave the highest bit unset so
	 * all values are positive. The lower 32 bits are used to count bytes.
	 */
	private static final int KEY_SHIFT = 32;

	/** Maximum value of the count field, also mask to extract the count. */
	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;

	/**
	 * Total number of bytes hashed into the structure, including \n. This is
	 * usually the size of the file, minus the number of CR bytes skipped as
	 * part of CRLF sequences in text content.
	 */
	private long hashedCnt;

	/** Number of non-zero entries in {@link #idHash}. */
	private int idSize;

	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
	private int idGrowAt;

	/**
	 * Pairings of content keys and counters.
	 * <p>
	 * Slots in the table are actually two ints wedged into a single long. The
	 * upper 32 bits store the content key, and the lower 32 bits store the
	 * number of bytes associated with that key. An empty slot is denoted by
	 * 0, which can never collide with a real entry because the count is
	 * never 0. Values can only be positive, which we enforce during key
	 * addition.
	 */
	private long[] idHash;

	/** {@code idHash.length == 1 << idHashBits}. */
	private int idHashBits;

	/**
	 * Create a new similarity index for the given object.
	 *
	 * @param obj
	 *            the object to hash
	 * @return similarity index for this object
	 * @throws java.io.IOException
	 *             file contents cannot be read from the repository.
	 * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
	 *             object hashing overflowed the storage capacity of the
	 *             SimilarityIndex.
	 */
	public static SimilarityIndex create(ObjectLoader obj) throws IOException,
			TableFullException {
		SimilarityIndex idx = new SimilarityIndex();
		idx.hash(obj);
		idx.sort();
		return idx;
	}

	SimilarityIndex() {
		idHashBits = 8;
		idHash = new long[1 << idHashBits];
		idGrowAt = growAt(idHashBits);
	}

	static boolean isBinary(ObjectLoader obj) throws IOException {
		if (obj.isLarge()) {
			try (ObjectStream in1 = obj.openStream()) {
				return RawText.isBinary(in1);
			}
		}
		byte[] raw = obj.getCachedBytes();
		return RawText.isBinary(raw, raw.length, true);
	}

	void hash(ObjectLoader obj) throws MissingObjectException, IOException,
			TableFullException {
		if (obj.isLarge()) {
			hashLargeObject(obj);
		} else {
			byte[] raw = obj.getCachedBytes();
			hash(raw, 0, raw.length);
		}
	}

	private void hashLargeObject(ObjectLoader obj) throws IOException,
			TableFullException {
		boolean text = !isBinary(obj);

		try (ObjectStream in2 = obj.openStream()) {
			hash(in2, in2.getSize(), text);
		}
	}

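	/**
	 * Hash the region {@code [ptr, end)} of {@code raw} into the table, one
	 * line (or 64 byte block, whichever ends first) at a time. Each block is
	 * folded with a djb2-style rolling hash ({@code hash = hash * 33 + c},
	 * seeded with 5381); when the content is text, the CR of a CRLF pair is
	 * skipped so CRLF and LF line endings hash identically.
	 */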
	void hash(byte[] raw, int ptr, int end) throws TableFullException {
		final boolean text = !RawText.isBinary(raw, raw.length, true);
		hashedCnt = 0;
		while (ptr < end) {
			int hash = 5381;
			int blockHashedCnt = 0;
			int start = ptr;

			// Hash one line, or one block, whichever occurs first.
			do {
				int c = raw[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n')
					break;
				hash = (hash << 5) + hash + c;
			} while (ptr < end && ptr - start < 64);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
		}
	}

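	/**
	 * Streaming variant of {@link #hash(byte[], int, int)} for objects too
	 * large to load into memory at once; the content is read in 4 KiB chunks
	 * while the same per-line/per-block hashing is applied.
	 */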
	void hash(InputStream in, long remaining, boolean text) throws IOException,
			TableFullException {
		byte[] buf = new byte[4096];
		int ptr = 0;
		int cnt = 0;

		while (0 < remaining) {
			int hash = 5381;
			int blockHashedCnt = 0;

			// Hash one line, or one block, whichever occurs first.
			int n = 0;
			do {
				if (ptr == cnt) {
					ptr = 0;
					cnt = in.read(buf, 0, buf.length);
					if (cnt <= 0)
						throw new EOFException();
				}

				n++;
				int c = buf[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n')
					break;
				hash = (hash << 5) + hash + c;
			} while (n < 64 && n < remaining);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
			remaining -= n;
		}
	}

	/**
	 * Sort the internal table so it can be used for efficient scoring.
	 * <p>
	 * Once sorted, additional lines/blocks cannot be added to the index.
	 */
	void sort() {
		// Sort the array. All of the empty space will wind up at the front,
		// because we forced all of the keys to always be positive. Later
		// we only work with the back half of the array.
		//
		Arrays.sort(idHash);
	}

	/**
	 * Compute the similarity score between this index and another.
	 * <p>
	 * A region of a file is defined as a line in a text file or a fixed-size
	 * block in a binary file. To prepare an index, each region in the file is
	 * hashed; the values and counts of hashes are retained in a sorted table.
	 * Define the similarity fraction F as the number of bytes in regions
	 * common to both files divided by the maximum number of bytes hashed
	 * from either file. The similarity score is F multiplied by the maxScore
	 * constant, yielding a range [0, maxScore]. It is defined as maxScore for
	 * the degenerate case of two empty files.
	 * <p>
	 * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
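	 * <p>
	 * As an illustrative example (numbers invented for this comment, not
	 * taken from any particular file pair): if the larger of the two indexes
	 * hashed 1000 bytes and the sum of the per-key minimum counts over the
	 * shared keys is 600 bytes, then {@code score(dst, 100)} evaluates
	 * {@code (600 * 100) / 1000} and returns {@code 60}.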
	 *
	 * @param dst
	 *            the other index
	 * @param maxScore
	 *            the score representing a 100% match
	 * @return the similarity score
	 */
	public int score(SimilarityIndex dst, int maxScore) {
		long max = Math.max(hashedCnt, dst.hashedCnt);
		if (max == 0)
			return maxScore;
		return (int) ((common(dst) * maxScore) / max);
	}

	long common(SimilarityIndex dst) {
		return common(this, dst);
	}

	private static long common(SimilarityIndex src, SimilarityIndex dst) {
		int srcIdx = src.packedIndex(0);
		int dstIdx = dst.packedIndex(0);
		long[] srcHash = src.idHash;
		long[] dstHash = dst.idHash;
		return common(srcHash, srcIdx, dstHash, dstIdx);
	}

	private static long common(long[] srcHash, int srcIdx, //
			long[] dstHash, int dstIdx) {
		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
			return 0;

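		// Both tables were sorted by sort(), so the non-zero entries can be
		// walked in lock step like a sorted merge: whenever the same key
		// appears in both tables, the smaller of the two byte counts is the
		// amount of content the files can share under that key.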
		long common = 0;
		int srcKey = keyOf(srcHash[srcIdx]);
		int dstKey = keyOf(dstHash[dstIdx]);

		for (;;) {
			if (srcKey == dstKey) {
				common += Math.min(countOf(srcHash[srcIdx]),
						countOf(dstHash[dstIdx]));

				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);

			} else if (srcKey < dstKey) {
				// Regions of src which do not appear in dst.
				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

			} else /* if (dstKey < srcKey) */{
				// Regions of dst which do not appear in src.
				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);
			}
		}

		return common;
	}

	// Testing only
	int size() {
		return idSize;
	}

	// Testing only
	int key(int idx) {
		return keyOf(idHash[packedIndex(idx)]);
	}

	// Testing only
	long count(int idx) {
		return countOf(idHash[packedIndex(idx)]);
	}

	// Brute force approach only for testing.
	int findIndex(int key) {
		for (int i = 0; i < idSize; i++)
			if (key(i) == key)
				return i;
		return -1;
	}

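	/**
	 * Translate a logical entry index into a position in {@link #idHash}.
	 * After {@link #sort()} the {@link #idSize} non-zero entries occupy the
	 * tail of the array, with all empty (zero) slots sorted to the front.
	 */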
	private int packedIndex(int idx) {
		return (idHash.length - idSize) + idx;
	}

	void add(int key, int cnt) throws TableFullException {
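		// Multiplicative hashing: the odd constant below is close to 2^32
		// divided by the golden ratio, which spreads nearby keys across the
		// table; the unsigned shift clears the sign bit so stored keys stay
		// non-negative, as required by the idHash encoding above.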
		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

		int j = slot(key);
		for (;;) {
			long v = idHash[j];
			if (v == 0) {
				// Empty slot in the table, store here.
				if (idGrowAt <= idSize) {
					grow();
					j = slot(key);
					continue;
				}
				idHash[j] = pair(key, cnt);
				idSize++;
				return;

			} else if (keyOf(v) == key) {
				// Same key, increment the counter. If the count would
				// overflow, pair() throws and the existing table entry is
				// left unchanged.
				//
				idHash[j] = pair(key, countOf(v) + cnt);
				return;

			} else if (++j >= idHash.length) {
				j = 0;
			}
		}
	}

	private static long pair(int key, long cnt) throws TableFullException {
		if (MAX_COUNT < cnt)
			throw new TableFullException();
		return (((long) key) << KEY_SHIFT) | cnt;
	}

	private int slot(int key) {
		// We use 31 - idHashBits because the upper bit was already forced
		// to be 0 and we want the remaining high bits to be used as the
		// table slot.
		//
		return key >>> (31 - idHashBits);
	}

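	/**
	 * Number of entries at which a table of {@code 1 << idHashBits} slots
	 * should be doubled. The formula keeps the load factor at
	 * {@code (idHashBits - 3) / idHashBits}: 5/8 for the initial 256 slot
	 * table, approaching 27/30 (90%) for the largest permitted table.
	 */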
	private static int growAt(int idHashBits) {
		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
	}

	@SuppressWarnings("UnusedException")
	private void grow() throws TableFullException {
		if (idHashBits == 30)
			throw new TableFullException();

		long[] oldHash = idHash;
		int oldSize = idHash.length;

		idHashBits++;
		idGrowAt = growAt(idHashBits);

		try {
			idHash = new long[1 << idHashBits];
		} catch (OutOfMemoryError noMemory) {
			throw TABLE_FULL_OUT_OF_MEMORY;
		}

		for (int i = 0; i < oldSize; i++) {
			long v = oldHash[i];
			if (v != 0) {
				int j = slot(keyOf(v));
				while (idHash[j] != 0)
					if (++j >= idHash.length)
						j = 0;
				idHash[j] = v;
			}
		}
	}

	private static int keyOf(long v) {
		return (int) (v >>> KEY_SHIFT);
	}

	private static long countOf(long v) {
		return v & MAX_COUNT;
	}

	/** Thrown by {@code create()} when the file is too large. */
	public static class TableFullException extends Exception {
		private static final long serialVersionUID = 1L;
	}
}
