ppkarwasz commented on code in PR #723:
URL: https://github.com/apache/commons-compress/pull/723#discussion_r2406421591
##########
src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java:
##########
@@ -225,164 +235,174 @@ public int read(final byte[] b, final int off, final
int len) throws IOException
return currentInputStream.read(b, off, len);
}
- private int read16(final DataInputStream dataIn) throws IOException {
- final int value = dataIn.readUnsignedShort();
- count(2);
- return Integer.reverseBytes(value) >>> 16;
+ private static int readUnsignedByte(InputStream in) throws IOException {
+ final int value = in.read();
+ if (value == -1) {
+ throw new EOFException();
+ }
+ return value & 0xff;
}
- private int read32(final DataInputStream dataIn) throws IOException {
- final int value = dataIn.readInt();
- count(4);
- return Integer.reverseBytes(value);
+ private int readSwappedUnsignedShort() throws IOException {
+ final int value = EndianUtils.readSwappedUnsignedShort(in);
+ count(2);
+ return value;
}
- private int read8(final DataInputStream dataIn) throws IOException {
- final int value = dataIn.readUnsignedByte();
+ private int readUnsignedByte() throws IOException {
+ final int value = readUnsignedByte(in);
count(1);
- return value;
+ return value & 0xff;
}
- private void readExtraData(final int firstHeaderSize, final
DataInputStream firstHeader, final LocalFileHeader localFileHeader) throws
IOException {
+ private static void readExtraData(final int firstHeaderSize, final
InputStream firstHeader, final LocalFileHeader localFileHeader) throws
IOException {
if (firstHeaderSize >= 33) {
- localFileHeader.extendedFilePosition = read32(firstHeader);
+ localFileHeader.extendedFilePosition =
readSwappedInteger(firstHeader);
if (firstHeaderSize >= 45) {
- localFileHeader.dateTimeAccessed = read32(firstHeader);
- localFileHeader.dateTimeCreated = read32(firstHeader);
- localFileHeader.originalSizeEvenForVolumes =
read32(firstHeader);
- pushedBackBytes(12);
+ localFileHeader.dateTimeAccessed =
readSwappedInteger(firstHeader);
+ localFileHeader.dateTimeCreated =
readSwappedInteger(firstHeader);
+ localFileHeader.originalSizeEvenForVolumes =
readSwappedInteger(firstHeader);
}
- pushedBackBytes(4);
}
}
+ /**
+ * Scans for the next valid ARJ header.
+ *
+ * @return The header bytes, or {@code null} if end of archive.
+ * @throws EOFException If the end of the stream is reached before a valid
header is found.
+ * @throws IOException If an I/O error occurs.
+ */
private byte[] readHeader() throws IOException {
- boolean found = false;
- byte[] basicHeaderBytes = null;
- do {
+ byte[] basicHeaderBytes;
+ // TODO: Explain why we are scanning for a valid ARJ header
+ // and don't throw, when an invalid/corrupted header is found,
+ // which might indicate a corrupted archive.
+ while (true) {
Review Comment:
The current code scans for the ARJ magic anywhere, ignoring invalid or
corrupted entries. This favors the reading of self-extracting archives over
strict validation of normal archives.
In a follow-up PR, I would like to:
* Use scanning **only** to locate the Main Archive Header (validate magic,
lengths, CRC).
* After AMH, parse **strictly and contiguously** to end-of-archive; no
skipping of invalid entries mid-stream.
* Add a builder option (e.g., `lenient` or `selfExtracting`) to enable AMH
scanning, while by default assume that the ARJ archive starts at offset 0.
@garydgregory, what do you think?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]