On Wed, Jun 24, 2015 at 11:10 PM, Johannes Schindelin
<[email protected]> wrote:
> Hi Paul,
>
> On 2015-06-18 13:25, Paul Tan wrote:
>
>> diff --git a/builtin/am.c b/builtin/am.c
>> index e9a3687..7b97ea8 100644
>> --- a/builtin/am.c
>> +++ b/builtin/am.c
>> @@ -121,6 +121,96 @@ static void am_destroy(const struct am_state *state)
>> strbuf_release(&sb);
>> }
>>
>> +/*
>> + * Returns 1 if the file looks like a piece of email a-la RFC2822, 0
>> otherwise.
>> + * We check this by grabbing all the non-indented lines and seeing if they
>> look
>> + * like they begin with valid header field names.
>> + */
>> +static int is_email(const char *filename)
>> +{
>> + struct strbuf sb = STRBUF_INIT;
>> + FILE *fp = xfopen(filename, "r");
>> + int ret = 1;
>> +
>> + while (!strbuf_getline(&sb, fp, '\n')) {
>> + const char *x;
>> +
>> + strbuf_rtrim(&sb);
>> +
>> + if (!sb.len)
>> + break; /* End of header */
>> +
>> + /* Ignore indented folded lines */
>> + if (*sb.buf == '\t' || *sb.buf == ' ')
>> + continue;
>> +
>> + /* It's a header if it matches the regexp "^[!-9;-~]+:" */
>
> Why not just compile a regex and use it here? We use regexes elsewhere
> anyway...
Ah, you're right. A regular expression would definitely be clearer.
I've fixed it on my end.
>> +/**
>> + * Attempts to detect the patch_format of the patches contained in `paths`,
>> + * returning the PATCH_FORMAT_* enum value. Returns PATCH_FORMAT_UNKNOWN if
>> + * detection fails.
>> + */
>> +static int detect_patch_format(struct string_list *paths)
>> +{
>> + enum patch_format ret = PATCH_FORMAT_UNKNOWN;
>> + struct strbuf l1 = STRBUF_INIT;
>> + struct strbuf l2 = STRBUF_INIT;
>> + struct strbuf l3 = STRBUF_INIT;
>> + FILE *fp;
>> +
>> + /*
>> + * We default to mbox format if input is from stdin and for directories
>> + */
>> + if (!paths->nr || !strcmp(paths->items->string, "-") ||
>> + is_directory(paths->items->string)) {
>> + ret = PATCH_FORMAT_MBOX;
>> + goto done;
>> + }
>> +
>> + /*
>> + * Otherwise, check the first 3 lines of the first patch, starting
>> + * from the first non-blank line, to try to detect its format.
>> + */
>> + fp = xfopen(paths->items->string, "r");
>> + while (!strbuf_getline(&l1, fp, '\n')) {
>> + strbuf_trim(&l1);
>> + if (l1.len)
>> + break;
>> + }
>> + strbuf_getline(&l2, fp, '\n');
>
> We should test the return value of `strbuf_getline()`; if EOF was reached
> already, `strbuf_getwholeline()` does not touch the strbuf. I know, the
> strbuf is still initialized empty here, but it is too easy to forget when
> changing this code.
Ah OK. I'll vote for doing a strbuf_reset() just before the
strbuf_getline() though.
>> + strbuf_trim(&l2);
>> + strbuf_getline(&l3, fp, '\n');
>> + strbuf_trim(&l3);
>> + fclose(fp);
>> +
>> + if (starts_with(l1.buf, "From ") || starts_with(l1.buf, "From: "))
>> + ret = PATCH_FORMAT_MBOX;
>
> Hmm. We can test that earlier and return without reading from the file any
> further, I think.
The "reading 3 lines at the beginning" logic is meant to support a
later patch where support for StGit and mercurial patches is added.
That said, it's true that we don't need to read 3 lines in this patch,
so I think I will remove it in this patch.
>> + else if (l1.len && l2.len && l3.len && is_email(paths->items->string))
>> + ret = PATCH_FORMAT_MBOX;
>
> Maybe we can do better than this by folding the `is_email() function into
> this here function, reusing the same strbuf to read the lines and keeping
> track of the email header lines we saw... I would really like to avoid
> opening the same file twice just to figure out whether it is in email format.
Okay, how about every time we call a strbuf_getline(), we save the
line to a string_list as well? Like string_list_getline_crlf() below:
/**
* Like strbuf_getline(), but supports both '\n' and "\r\n" as line
* terminators.
*/
static int strbuf_getline_crlf(struct strbuf *sb, FILE *fp)
{
if (strbuf_getwholeline(sb, fp, '\n'))
return EOF;
if (sb->buf[sb->len - 1] == '\n') {
strbuf_setlen(sb, sb->len - 1);
if (sb->len > 0 && sb->buf[sb->len - 1] == '\r')
strbuf_setlen(sb, sb->len - 1);
}
return 0;
}
/**
* Like strbuf_getline_crlf(), but appends the line to a string_list and
* returns it as a string. Returns NULL on EOF.
*/
static const char *string_list_getline_crlf(struct string_list *list, FILE *fp)
{
struct strbuf sb = STRBUF_INIT;
struct string_list_item *item;
if (strbuf_getline_crlf(&sb, fp))
return NULL;
item = string_list_append_nodup(list, strbuf_detach(&sb, NULL));
return item->string;
}
So now, is_email() can have access to previously-read lines, and if it
needs some more, it can read more as well:
static int is_email(struct string_list *lines, FILE *fp)
{
const char *header_regex = "^[!-9;-~]+:";
regex_t regex;
int ret = 1;
size_t i;
if (regcomp(®ex, header_regex, REG_NOSUB | REG_EXTENDED))
die("Invalid search pattern: %s", header_regex);
for (i = 0; i < lines->nr || string_list_getline_crlf(lines, fp); i++) {
const char *line = lines->items[i].string;
if (!*line)
break; /* End of header */
/* Ignore indented folded lines */
if (*line == '\t' || *line == ' ')
continue;
/* It's a header if it matches header_regex */
if (regexec(®ex, line, 0, NULL, 0)) {
ret = 0;
goto done;
}
}
done:
regfree(®ex);
return ret;
}
Which solves the problem of opening the file 2 times. What do you think?
Regards,
Paul
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html