package edu.umd.cloud9.collection.trecweb;

import edu.umd.cloud9.collection.WebDocument;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;

/* loaded from: input_file:edu/umd/cloud9/collection/trecweb/TrecWebDocument.class */
/**
 * A single document from a TREC-style web collection, i.e. a record delimited by
 * {@code <DOC>} and {@code </DOC>} that carries a {@code <DOCNO>} identifier and a
 * {@code <DOCHDR>} header whose first token is read as the document URL.
 *
 * <p>Serialization via {@link #write(DataOutput)} / {@link #readFields(DataInput)} covers only
 * the document id and the content; the URL is neither written nor restored by those methods.
 *
 * <p>The static reading methods share one class-level scratch buffer without any
 * synchronization, so they must not be called concurrently from several threads.
 */
public class TrecWebDocument extends WebDocument {
    public static final String XML_START_TAG = "<DOC>";
    public static final String XML_END_TAG = "</DOC>";

    /** Charset used for the tags, for the serialized content and for decoding raw records. */
    private static final String CHARSET = "UTF-8";

    private static final String DOCNO_OPEN = "<DOCNO>";
    private static final String DOCNO_CLOSE = "</DOCNO>";
    private static final String DOCHDR_OPEN = "<DOCHDR>";
    private static final String DOCHDR_CLOSE = "</DOCHDR>";

    /** Encoded record delimiters, computed once instead of on every construction. */
    private static final byte[] START_TAG_BYTES = utf8Bytes(XML_START_TAG);
    private static final byte[] END_TAG_BYTES = utf8Bytes(XML_END_TAG);

    /** Scratch buffer that accumulates the bytes of the record currently being read. */
    private static final DataOutputBuffer buffer = new DataOutputBuffer();

    private String docid;
    private String content;
    private String url;

    /** Creates an empty document; fields are populated by the read methods. */
    public TrecWebDocument() {
    }

    /**
     * Serializes the document id followed by the UTF-8 encoded content (length-prefixed with a
     * variable-length int). The URL is not part of the serialized form.
     *
     * @param dataOutput sink to write to
     * @throws IOException if writing fails
     */
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.docid);
        byte[] bytes = this.content.getBytes(CHARSET);
        WritableUtils.writeVInt(dataOutput, bytes.length);
        dataOutput.write(bytes, 0, bytes.length);
    }

    /**
     * Restores the document id and content from the format produced by
     * {@link #write(DataOutput)}. The URL field is left untouched.
     *
     * @param dataInput source to read from
     * @throws IOException if reading fails
     */
    public void readFields(DataInput dataInput) throws IOException {
        this.docid = dataInput.readUTF();
        int length = WritableUtils.readVInt(dataInput);
        byte[] bytes = new byte[length];
        dataInput.readFully(bytes, 0, length);
        this.content = new String(bytes, CHARSET);
    }

    @Override // edu.umd.cloud9.collection.Indexable
    public String getDocid() {
        return this.docid;
    }

    @Override // edu.umd.cloud9.collection.Indexable
    public String getContent() {
        return this.content;
    }

    @Override // edu.umd.cloud9.collection.WebDocument
    public String getURL() {
        return this.url;
    }

    /**
     * Parses one raw record and stores its id, URL and content in {@code trecWebDocument}.
     *
     * <p>The id is the text between {@code <DOCNO>} and {@code </DOCNO>}. The URL starts one
     * character after {@code <DOCHDR>} and runs up to the next space. The content is everything
     * after {@code </DOCHDR>} except the last six characters of {@code str}, which are assumed
     * to be the closing {@code </DOC>} tag. Fields are assigned as they are parsed, so a failure
     * part-way through leaves the earlier fields already updated.
     *
     * @param trecWebDocument document to populate
     * @param str raw record text
     * @throws RuntimeException if {@code str} is null or a required tag or delimiter is missing
     */
    public static void readDocument(TrecWebDocument trecWebDocument, String str) {
        if (str == null) {
            throw new RuntimeException("Error, can't read null string!");
        }

        int docnoStart = str.indexOf(DOCNO_OPEN);
        if (docnoStart == -1) {
            throw new RuntimeException("Unable to find DOCNO tag!");
        }
        int docnoEnd = str.indexOf(DOCNO_CLOSE, docnoStart);
        if (docnoEnd == -1) {
            throw new RuntimeException("Unable to find closing DOCNO tag!");
        }
        trecWebDocument.docid = str.substring(docnoStart + DOCNO_OPEN.length(), docnoEnd);

        int headerStart = str.indexOf(DOCHDR_OPEN);
        if (headerStart == -1) {
            throw new RuntimeException("Unable to find DOCHDR tag!");
        }
        // Skip the tag plus the single separator character that follows it.
        int urlStart = headerStart + DOCHDR_OPEN.length() + 1;
        int urlEnd = str.indexOf(" ", headerStart);
        if (urlEnd < urlStart) {
            // Covers both "no space at all" (-1) and a space directly after the tag.
            throw new RuntimeException("Unable to find URL in DOCHDR!");
        }
        trecWebDocument.url = str.substring(urlStart, urlEnd);

        int headerEnd = str.indexOf(DOCHDR_CLOSE);
        if (headerEnd == -1) {
            throw new RuntimeException("Unable to find DOCHDR tag!");
        }
        int contentStart = headerEnd + DOCHDR_CLOSE.length();
        int contentEnd = str.length() - XML_END_TAG.length();
        if (contentEnd < contentStart) {
            throw new RuntimeException("Unable to find document content before closing DOC tag!");
        }
        trecWebDocument.content = str.substring(contentStart, contentEnd);
    }

    /**
     * Reads the next {@code <DOC>...</DOC>} record from the stream and parses it into
     * {@code trecWebDocument}.
     *
     * @param trecWebDocument document to populate
     * @param dataInputStream stream positioned anywhere before the next record
     * @return {@code true} if a complete record was read and parsed; {@code false} if the stream
     *         ended before a start tag or before the matching end tag was seen
     * @throws IOException if reading from the stream fails
     */
    public static boolean readNextTrecWebDocument(TrecWebDocument trecWebDocument, DataInputStream dataInputStream) throws IOException {
        if (!readUntilMatch(dataInputStream, START_TAG_BYTES, false)) {
            return false;
        }
        try {
            buffer.write(START_TAG_BYTES);
            if (!readUntilMatch(dataInputStream, END_TAG_BYTES, true)) {
                return false;
            }
            // Only the first getLength() bytes of the backing array belong to this record;
            // the remainder is unused capacity or leftovers from earlier records.
            String record = new String(buffer.getData(), 0, buffer.getLength(), CHARSET);
            readDocument(trecWebDocument, record);
            return true;
        } finally {
            buffer.reset();
        }
    }

    /**
     * Consumes bytes from {@code in} until {@code pattern} has been seen or the stream ends.
     *
     * <p>After a mismatch the current byte is re-tested against the first pattern byte, so input
     * such as {@code <<DOC>} still matches. This restart rule is exact for patterns whose first
     * byte does not occur again inside the pattern, which holds for both tags used here.
     *
     * @param in stream to read from
     * @param pattern byte sequence to look for
     * @param capture whether every consumed byte is appended to the shared buffer
     * @return {@code true} once the pattern was fully matched, {@code false} on end of stream
     * @throws IOException if reading from the stream fails
     */
    private static boolean readUntilMatch(DataInputStream in, byte[] pattern, boolean capture) throws IOException {
        int matched = 0;
        while (true) {
            int current = in.read();
            if (current == -1) {
                return false;
            }
            if (capture) {
                buffer.write(current);
            }
            if (current == pattern[matched]) {
                matched++;
                if (matched >= pattern.length) {
                    return true;
                }
            } else {
                matched = (current == pattern[0]) ? 1 : 0;
            }
        }
    }

    /** Encodes {@code text} as UTF-8; the charset is mandatory on every JVM, so failure is fatal. */
    private static byte[] utf8Bytes(String text) {
        try {
            return text.getBytes(CHARSET);
        } catch (IOException e) {
            throw new IllegalStateException("UTF-8 charset is not available", e);
        }
    }
}
