// org.apache.tika.parser.chm.accessor.ChmPmglHeader Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.chm.accessor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
 * There are two types of directory chunks -- index chunks, and listing
 * chunks. The index chunk will be omitted if there is only one listing chunk.
 * A listing chunk has the following format:
 *
 * <pre>
 * 0000: char[4] 'PMGL'
 * 0004: DWORD   Length of free space and/or quickref area at end of
 *               directory chunk
 * 0008: DWORD   Always 0
 * 000C: DWORD   Chunk number of previous listing chunk when reading
 *               directory in sequence (-1 if this is the first listing chunk)
 * 0010: DWORD   Chunk number of next listing chunk when reading directory
 *               in sequence (-1 if this is the last listing chunk)
 * 0014: Directory listing entries (to quickref area)
 *       Sorted by filename; the sort is case-insensitive
 * </pre>
 *
 * The quickref area is written backwards from the end of the chunk. One
 * quickref entry exists for every n entries in the file, where n is
 * calculated as {@code 1 + (1 << quickref density)}. So for density = 2,
 * n = 5.
 *
 * <pre>
 * Chunklen-0002: WORD Number of entries in the chunk
 * Chunklen-0004: WORD Offset of entry n from entry 0
 * Chunklen-0008: WORD Offset of entry 2n from entry 0
 * Chunklen-000C: WORD Offset of entry 3n from entry 0
 * ...
 * </pre>
 *
 * The format of a directory listing entry is as follows:
 *
 * <pre>
 * BYTE:   length of name
 * BYTEs:  name (UTF-8 encoded)
 * ENCINT: content section
 * ENCINT: offset
 * ENCINT: length
 * </pre>
 *
 * The offset is from the beginning of the content section the file is in,
 * after the section has been decompressed (if appropriate). The length also
 * refers to the length of the file in the section after decompression.
 * <p>
 * There are two kinds of file represented in the directory: user data and
 * format-related files. The files which are format-related have names which
 * begin with '::'; the user data files have names which begin with "/".
 *
 * @see <a href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">
 *      Microsoft's HTML Help (.chm) format (incomplete)</a>
 *
 * @author olegt
 *
 */
public class ChmPmglHeader implements ChmAccessor {
    private static final long serialVersionUID = -6139486487475923593L;

    // NOTE: instance field names are intentionally left untouched. The class
    // declares a serialVersionUID, so renaming fields could alter its
    // serialized form.

    /** Signature bytes (offset 0x00); initialised to "PMGL" and overwritten by parsing. */
    private byte[] signature;
    /** Unsigned 32-bit value read from offset 0x04. */
    private long free_space; /* 4 */
    /** Unsigned 32-bit value read from offset 0x08. */
    private long unknown_0008; /* 8 */
    /** Signed 32-bit value read from offset 0x0C. */
    private int block_prev; /* c */
    /** Signed 32-bit value read from offset 0x10. */
    private int block_next; /* 10 */

    /* local usage */
    /** Number of bytes of the buffer being parsed that have not been consumed yet. */
    private int dataRemained;
    /** Read cursor into the buffer being parsed. */
    private int currentPlace = 0;

    /**
     * Creates a header whose signature is pre-filled with the UTF-8 bytes of
     * {@code ChmConstants.PMGL}; all other fields keep their default values.
     */
    public ChmPmglHeader() {
        signature = ChmConstants.PMGL.getBytes(UTF_8); /* 0 (PMGL) */
    }

    private int getDataRemained() {
        return dataRemained;
    }

    private void setDataRemained(int dataRemained) {
        this.dataRemained = dataRemained;
    }

    private int getCurrentPlace() {
        return currentPlace;
    }

    private void setCurrentPlace(int currentPlace) {
        this.currentPlace = currentPlace;
    }

    /**
     * @return the free-space value of this header
     */
    public long getFreeSpace() {
        return free_space;
    }

    /**
     * Stores the free-space value.
     *
     * @param free_space the value to store; must not be negative
     * @throws TikaException if {@code free_space} is negative
     */
    public void setFreeSpace(long free_space) throws TikaException {
        if (free_space < 0) {
            throw new TikaException("Bad PMGLheader.FreeSpace=" + free_space);
        }
        this.free_space = free_space;
    }

    /**
     * Renders all header fields on a single line terminated by the platform
     * line separator.
     */
    @Override
    public String toString() {
        // The "signatute" label spelling is retained so the produced text does
        // not change for anything that already consumes it.
        return new StringBuilder()
                .append("signatute:=").append(new String(getSignature(), UTF_8)).append(", ")
                .append("free space:=").append(getFreeSpace()).append(", ")
                .append("unknown0008:=").append(getUnknown0008()).append(", ")
                .append("prev block:=").append(getBlockPrev()).append(", ")
                .append("next block:=").append(getBlockNext())
                .append(System.lineSeparator())
                .toString();
    }

    /**
     * Copies the first {@code count} bytes of {@code data} into the signature
     * of {@code chmPmglHeader} and (re)initialises this object's read cursor
     * so that the next read starts right after those bytes.
     *
     * @param data          buffer to read from; checked with
     *                      {@code ChmAssert.assertByteArrayNotNull}
     * @param chmPmglHeader header whose signature array receives the bytes
     * @param count         number of signature bytes to copy
     * @throws TikaException if {@code count} is negative or exceeds either the
     *                       buffer or the signature array
     */
    protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader,
            int count) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        if (count < 0 || count > data.length
                || count > chmPmglHeader.signature.length) {
            // Report a checked parsing failure instead of letting
            // System.arraycopy raise an unchecked IndexOutOfBoundsException.
            throw new TikaException("Bad PMGLheader signature length=" + count);
        }
        System.arraycopy(data, 0, chmPmglHeader.signature, 0, count);
        // The signature is always copied from offset 0, so the cursor is set
        // (not incremented): this keeps a repeated parse on the same instance
        // from starting at a stale offset.
        this.setCurrentPlace(count);
        this.setDataRemained(data.length - count);
    }

    /**
     * Reads a signed little-endian 32-bit integer at the current cursor and
     * advances the cursor by four bytes.
     *
     * @throws TikaException if fewer than four bytes remain
     */
    private int unmarshalInt32(byte[] data) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        if (4 > this.getDataRemained()) {
            throw new TikaException("4 > dataLenght");
        }
        return readInt32LittleEndian(data);
    }

    /**
     * Reads an unsigned little-endian 32-bit integer at the current cursor and
     * advances the cursor by four bytes.
     *
     * @return the value in the range {@code 0 .. 0xFFFFFFFF}
     * @throws ChmParsingException if fewer than four bytes remain
     */
    private long unmarshalUInt32(byte[] data) throws ChmParsingException {
        ChmAssert.assertByteArrayNotNull(data);
        if (4 > getDataRemained()) {
            throw new ChmParsingException("4 > dataLenght");
        }
        // The bytes are combined in int arithmetic; mask after widening so a
        // set top bit does not turn the result into a negative long.
        return readInt32LittleEndian(data) & 0xFFFFFFFFL;
    }

    /** Assembles four bytes at the cursor (least significant first) and advances the cursor. */
    private int readInt32LittleEndian(byte[] data) {
        int pos = getCurrentPlace();
        int value = (data[pos] & 0xff)
                | (data[pos + 1] & 0xff) << 8
                | (data[pos + 2] & 0xff) << 16
                | (data[pos + 3] & 0xff) << 24;
        setCurrentPlace(pos + 4);
        setDataRemained(getDataRemained() - 4);
        return value;
    }

    /**
     * Populates {@code chmPmglHeader} from the leading bytes of {@code data}:
     * signature, free space, unknown0008, previous block and next block, in
     * that order, and finally verifies that the signature equals
     * {@code ChmConstants.PMGL}.
     *
     * @param data          buffer holding at least
     *                      {@code ChmConstants.CHM_PMGL_LEN} bytes
     * @param chmPmglHeader header instance to fill in
     * @throws TikaException if the buffer is too short, the free-space value
     *                       is rejected, or the signature does not match
     */
    // @Override
    public void parse(byte[] data, ChmPmglHeader chmPmglHeader) throws TikaException {
        // Same null check the unmarshal helpers use, performed before
        // data.length is dereferenced.
        ChmAssert.assertByteArrayNotNull(data);
        if (data.length < ChmConstants.CHM_PMGL_LEN) {
            throw new TikaException(ChmPmglHeader.class.getName()
                    + " we only know how to deal with a 0x14 byte structures");
        }

        /* unmarshal fields */
        chmPmglHeader.unmarshalCharArray(data, chmPmglHeader,
                ChmConstants.CHM_SIGNATURE_LEN);

        long freeSpace = chmPmglHeader.unmarshalUInt32(data);
        if (freeSpace > Integer.MAX_VALUE) {
            // Values with the top bit set used to be decoded as negative
            // numbers and were therefore rejected by setFreeSpace(); keep
            // rejecting them now that the read is genuinely unsigned.
            throw new TikaException("Bad PMGLheader.FreeSpace=" + freeSpace);
        }
        chmPmglHeader.setFreeSpace(freeSpace);
        chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data));
        chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data));
        chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data));

        /* check structure */
        String parsedSignature = new String(chmPmglHeader.getSignature(), UTF_8);
        if (!ChmConstants.PMGL.equals(parsedSignature)) {
            throw new ChmParsingException(ChmPmglHeader.class.getName()
                    + " pmgl != pmgl.signature");
        }
    }

    /**
     * @return the signature byte array held by this header (not a copy)
     */
    public byte[] getSignature() {
        return signature;
    }

    protected void setSignature(byte[] signature) {
        this.signature = signature;
    }

    /**
     * @return the value read from offset 0x08
     */
    public long getUnknown0008() {
        return unknown_0008;
    }

    protected void setUnknown0008(long unknown_0008) {
        this.unknown_0008 = unknown_0008;
    }

    /**
     * @return the previous-block value read from offset 0x0C
     */
    public int getBlockPrev() {
        return block_prev;
    }

    protected void setBlockPrev(int block_prev) {
        this.block_prev = block_prev;
    }

    /**
     * @return the next-block value read from offset 0x10
     */
    public int getBlockNext() {
        return block_next;
    }

    protected void setBlockNext(int block_next) {
        this.block_next = block_next;
    }
}