From f9eb3aecac9a4a8f1ec86ce131b7085e833f313a Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Wed, 2 May 2018 15:44:51 -0400
Subject: fsck: actually fsck blob data

commit 7ac4f3a007e2567f9d2492806186aa063f9a08d6 upstream.

Because fscking a blob has always been a noop, we didn't
bother passing around the blob data. In preparation for
content-level checks, let's fix up a few things:

  1. The fsck_object() function just returns success for any
     blob. Let's a noop fsck_blob(), which we can fill in
     with actual logic later.

  2. The fsck_loose() function in builtin/fsck.c
     just threw away blob content after loading it. Let's
     hold onto it until after we've called fsck_object().

     The easiest way to do this is to just drop the
     parse_loose_object() helper entirely. Incidentally,
     this also fixes a memory leak: if we successfully
     loaded the object data but did not parse it, we would
     have left the function without freeing it.

  3. When fsck_loose() loads the object data, it
     does so with a custom read_loose_object() helper. This
     function streams any blobs, regardless of size, under
     the assumption that we're only checking the sha1.

     Instead, let's actually load blobs smaller than
     big_file_threshold, as the normal object-reading
     code-paths would do. This lets us fsck small files, and
     a NULL return is an indication that the blob was so big
     that it needed to be streamed, and we can pass that
     information along to fsck_blob().

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 builtin/fsck.c | 51 +++++++++++++++++++++++++-------------------------
 fsck.c         |  8 +++++++-
 sha1_file.c    |  2 +-
 3 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/builtin/fsck.c b/builtin/fsck.c
index 4b91ee95e6..95053ec02f 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -318,7 +318,7 @@ static void check_connectivity(void)
 	}
 }
 
-static int fsck_obj(struct object *obj)
+static int fsck_obj(struct object *obj, void *buffer, unsigned long size)
 {
 	if (obj->flags & SEEN)
 		return 0;
@@ -330,7 +330,7 @@ static int fsck_obj(struct object *obj)
 
 	if (fsck_walk(obj, NULL, &fsck_obj_options))
 		objerror(obj, "broken links");
-	if (fsck_object(obj, NULL, 0, &fsck_obj_options))
+	if (fsck_object(obj, buffer, size, &fsck_obj_options))
 		return -1;
 
 	if (obj->type == OBJ_TREE) {
@@ -376,7 +376,7 @@ static int fsck_obj_buffer(const unsigned char *sha1, enum object_type type,
 		return error("%s: object corrupt or missing", sha1_to_hex(sha1));
 	}
 	obj->flags = HAS_OBJ;
-	return fsck_obj(obj);
+	return fsck_obj(obj, buffer, size);
 }
 
 static int default_refs;
@@ -476,43 +476,42 @@ static void get_default_heads(void)
 	}
 }
 
-static struct object *parse_loose_object(const unsigned char *sha1,
-					 const char *path)
+static int fsck_loose(const unsigned char *sha1, const char *path, void *data)
 {
 	struct object *obj;
-	void *contents;
 	enum object_type type;
 	unsigned long size;
+	void *contents;
 	int eaten;
 
-	if (read_loose_object(path, sha1, &type, &size, &contents) < 0)
-		return NULL;
+	if (read_loose_object(path, sha1, &type, &size, &contents) < 0) {
+		errors_found |= ERROR_OBJECT;
+		error("%s: object corrupt or missing: %s",
+		      sha1_to_hex(sha1), path);
+		return 0; /* keep checking other objects */
+	}
 
 	if (!contents && type != OBJ_BLOB)
 		die("BUG: read_loose_object streamed a non-blob");
 
 	obj = parse_object_buffer(sha1, type, size, contents, &eaten);
 
+	if (!obj) {
+		errors_found |= ERROR_OBJECT;
+		error("%s: object could not be parsed: %s",
+		      sha1_to_hex(sha1), path);
+		if (!eaten)
+			free(contents);
+		return 0; /* keep checking other objects */
+	}
+
+	obj->flags = HAS_OBJ;
+	if (fsck_obj(obj, contents, size))
+		errors_found |= ERROR_OBJECT;
+
 	if (!eaten)
 		free(contents);
-	return obj;
-}
-
-static int fsck_loose(const unsigned char *sha1, const char *path, void *data)
-{
-	struct object *obj = parse_loose_object(sha1, path);
-
-	if (!obj) {
-		errors_found |= ERROR_OBJECT;
-		error("%s: object corrupt or missing: %s",
-		      sha1_to_hex(sha1), path);
-		return 0; /* keep checking other objects */
-	}
-
-	obj->flags = HAS_OBJ;
-	if (fsck_obj(obj))
-		errors_found |= ERROR_OBJECT;
-	return 0;
+	return 0; /* keep checking other objects, even if we saw an error */
 }
 
 static int fsck_cruft(const char *basename, const char *path, void *data)
diff --git a/fsck.c b/fsck.c
index 1336ead9eb..56546206ae 100644
--- a/fsck.c
+++ b/fsck.c
@@ -893,6 +893,12 @@ static int fsck_tag(struct tag *tag, const char *data,
 	return fsck_tag_buffer(tag, data, size, options);
 }
 
+static int fsck_blob(struct blob *blob, const char *buf,
+		     unsigned long size, struct fsck_options *options)
+{
+	return 0;
+}
+
 int fsck_object(struct object *obj, void *data, unsigned long size,
 	struct fsck_options *options)
 {
@@ -900,7 +906,7 @@ int fsck_object(struct object *obj, void *data, unsigned long size,
 		return report(options, obj, FSCK_MSG_BAD_OBJECT_SHA1, "no valid object to fsck");
 
 	if (obj->type == OBJ_BLOB)
-		return 0;
+		return fsck_blob((struct blob *)obj, data, size, options);
 	if (obj->type == OBJ_TREE)
 		return fsck_tree((struct tree *) obj, options);
 	if (obj->type == OBJ_COMMIT)
diff --git a/sha1_file.c b/sha1_file.c
index 0a609a5772..65db803392 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -3876,7 +3876,7 @@ int read_loose_object(const char *path,
 		goto out;
 	}
 
-	if (*type == OBJ_BLOB) {
+	if (*type == OBJ_BLOB && *size > big_file_threshold) {
 		if (check_stream_sha1(&stream, hdr, *size, path, expected_sha1) < 0)
 			goto out;
 	} else {
-- 
2.17.0.921.gf22659ad46

