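When joining large files where one file has a very large number of lines
sharing the same join field, memory use is excessive (since join reads
all matching lines from both files into RAM before proceeding).
Below is a patch that solves this problem: it reads the two files in
lockstep until one side's run of matching lines ends, then streams the
remaining matching lines of the other side one at a time, so only the
shorter run has to be held in memory.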
Andy Jewell
[EMAIL PROTECTED]
--- textutils-2.0/src/join.c Sun Jul 4 03:38:02 1999
+++ join.c Wed Oct 25 11:27:21 2000
@@ -486,6 +501,7 @@
struct seq seq1, seq2;
struct line line;
int diff, i, j, eof1, eof2;
+ int end1, end2;
/* Read the first line of each file. */
initseq (&seq1);
@@ -515,35 +531,85 @@
continue;
}
- /* Keep reading lines from file1 as long as they continue to
- match the current line from file2. */
+ /* Read lines from file1 and file2 until one of them stops
+ matching the other */
eof1 = 0;
- do
- if (!getseq (fp1, &seq1))
- {
- eof1 = 1;
- ++seq1.count;
- break;
- }
- while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
-
- /* Keep reading lines from file2 as long as they continue to
- match the current line from file1. */
eof2 = 0;
- do
- if (!getseq (fp2, &seq2))
- {
- eof2 = 1;
- ++seq2.count;
- break;
- }
- while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+ end1 = 0;
+ end2 = 0;
- if (print_pairables)
+ while (1)
{
- for (i = 0; i < seq1.count - 1; ++i)
+ if (!getseq (fp1, &seq1))
+ {
+ eof1 = 1;
+ end1 = 1;
+ ++seq1.count;
+ break;
+ }
+
+ if (keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]))
+ {
+ end1 = 1;
+ break;
+ }
+
+ if (!getseq (fp2, &seq2))
+ {
+ eof2 = 1;
+ end2 = 1;
+ ++seq2.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]))
+ {
+ end2 = 1;
+ break;
+ }
+ }
+
+ if (end1)
+ {
+ for (i = 0; i < seq2.count; ++i)
+ {
+ for (j = 0; j < seq1.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[i]);
+ freeline (&seq2.lines[i]);
+ }
+ while (1) {
+ seq2.count = 0;
+ if (!getseq (fp2, &seq2))
+ {
+ eof2 = 1;
+ ++seq2.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
+ for (j = 0; j < seq1.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[j], &seq2.lines[0]);
+ freeline (&seq2.lines[0]);
+ }
+ }
+ else /* end2 */
+ {
+ for (i = 0; i < seq1.count; ++i)
+ {
+ for (j = 0; j < seq2.count - 1; ++j)
+ if (print_pairables) prjoin (&seq1.lines[i], &seq2.lines[j]);
+ freeline (&seq1.lines[i]);
+ }
+ while (1) {
+ seq1.count = 0;
+ if (!getseq (fp1, &seq1))
+ {
+ eof1 = 1;
+ ++seq1.count;
+ break;
+ }
+ if (keycmp (&seq1.lines[0], &seq2.lines[0])) break;
for (j = 0; j < seq2.count - 1; ++j)
- prjoin (&seq1.lines[i], &seq2.lines[j]);
+ if (print_pairables) prjoin (&seq1.lines[0], &seq2.lines[j]);
+ freeline (&seq1.lines[0]);
+ }
}
for (i = 0; i < seq1.count - 1; ++i)
@@ -555,7 +621,7 @@
}
else
seq1.count = 0;
-
+
for (i = 0; i < seq2.count - 1; ++i)
freeline (&seq2.lines[i]);
if (!eof2)
_______________________________________________
Bug-textutils mailing list
[EMAIL PROTECTED]
http://mail.gnu.org/mailman/listinfo/bug-textutils