package edu.umd.hooka.corpora;

import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import edu.umd.hooka.alignment.aer.ReferenceAlignment;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:edu/umd/hooka/corpora/ParallelCorpusReader.class */
/**
 * SAX handler that reads parallel-corpus XML and turns it into {@link ParallelChunk} objects.
 *
 * <p>The handler recognises four elements, matched case-insensitively on the qualified name:
 * <ul>
 *   <li>{@code <pdoc name="...">} - the document wrapper; statistics are printed to stderr when
 *       it closes,</li>
 *   <li>{@code <pchunk name="...">} - starts a new {@link ParallelChunk}; the chunk is handed to
 *       the callback when the element closes,</li>
 *   <li>{@code <s lang="...">} - one segment of text in the given language,</li>
 *   <li>{@code <wordalignment langpair="...">} - reference alignment points in Pharaoh format;
 *       must follow the {@code <s>} elements for both languages of the pair.</li>
 * </ul>
 * Any other element causes a {@link SAXException}.
 *
 * <p>Instances keep parsing state in fields and are therefore not safe for concurrent use.
 */
public class ParallelCorpusReader extends DefaultHandler {
    /** Most recent chunk delivered through {@link ChunkSetCB}; the result of {@link #parseString}. */
    private ParallelChunk resultChunk;
    /** Callback that receives every completed {@code <pchunk>}. */
    PChunkCallback cb_;
    /** Parser used by {@link #parseString}. */
    SAXParser sp;
    /** Chunk currently being assembled. */
    ParallelChunk pchunk;
    /** Language of the {@code <s>} element currently being read. */
    Language lang;
    /** Language pair of the {@code <wordalignment>} element currently being read. */
    LanguagePair langpair;
    /** Text collected for the current {@code <s>} / {@code <wordalignment>}; null outside them. */
    StringBuffer tempVal;
    /** Value of the {@code name} attribute of the enclosing {@code <pdoc>}. */
    String docName;
    /** Number of {@code <pchunk>} elements closed so far. */
    int pchunkCount;
    /** Number of non-empty {@code <s>} segments added so far. */
    int chunkCount;
    /** Number of reference alignments added so far. */
    int refAlignCount;

    /** Callback that stores the delivered chunk in the owning reader's {@code resultChunk}. */
    static class ChunkSetCB implements PChunkCallback {
        ParallelCorpusReader pcr_;

        ChunkSetCB(ParallelCorpusReader parallelCorpusReader) {
            this.pcr_ = parallelCorpusReader;
        }

        @Override
        public void handlePChunk(ParallelChunk parallelChunk) {
            this.pcr_.resultChunk = parallelChunk;
        }
    }

    /** Receives each {@link ParallelChunk} as soon as its {@code <pchunk>} element closes. */
    public interface PChunkCallback {
        void handlePChunk(ParallelChunk parallelChunk);
    }

    /**
     * Creates a reader for use with {@link #parseString(String)}; parsed chunks are captured
     * internally and returned from that method.
     *
     * @throws RuntimeException if no SAX parser can be created (the failure is attached as cause)
     */
    public ParallelCorpusReader() {
        this.cb_ = new ChunkSetCB(this);
        try {
            this.sp = SAXParserFactory.newInstance().newSAXParser();
        } catch (Exception e) {
            // Keep the original failure as the cause instead of only printing it.
            throw new RuntimeException("Couldn't build XML parser", e);
        }
    }

    /**
     * Creates a reader that forwards every parsed chunk to the given callback.
     *
     * @param pChunkCallback receiver of the parsed chunks
     * @throws RuntimeException if no SAX parser can be created (the failure is attached as cause)
     */
    private ParallelCorpusReader(PChunkCallback pChunkCallback) {
        this.cb_ = pChunkCallback;
        try {
            this.sp = SAXParserFactory.newInstance().newSAXParser();
        } catch (Exception e) {
            throw new RuntimeException("Failed " + e, e);
        }
    }

    /**
     * Parses an XML fragment held in memory.
     *
     * @param str the XML text to parse
     * @return the last {@code <pchunk>} delivered to the internal callback, or {@code null} if
     *         the text contained none
     * @throws RuntimeException if reading or parsing fails; the underlying {@link IOException}
     *         or {@link SAXException} is attached as the cause
     */
    public ParallelChunk parseString(String str) {
        this.resultChunk = null;
        try {
            this.sp.parse(new InputSource(new StringReader(str)), this);
            return this.resultChunk;
        } catch (IOException e) {
            this.resultChunk = null;
            throw new RuntimeException("ioe: " + e, e);
        } catch (SAXException e) {
            this.resultChunk = null;
            throw new RuntimeException("SaxE: " + e + "\n" + str, e);
        }
    }

    /**
     * Parses the XML document at the given URI and passes every chunk to the callback.
     *
     * <p>Checked parsing failures are reported on stderr and otherwise ignored, so the method
     * returns normally even when the document could not be read completely.
     *
     * @param str            URI (or file name) of the document, as accepted by
     *                       {@link SAXParser#parse(String, DefaultHandler)}
     * @param pChunkCallback receiver of the parsed chunks
     */
    public static void parseXMLDocument(String str, PChunkCallback pChunkCallback) {
        try {
            SAXParserFactory.newInstance().newSAXParser().parse(str, new ParallelCorpusReader(pChunkCallback));
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        }
    }

    /**
     * Initialises the state needed for the element that is being opened.
     *
     * @throws SAXException if the element is not one of the four known tags
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if (qName.equalsIgnoreCase("pchunk")) {
            this.pchunk = new ParallelChunk();
            this.pchunk.setName(attributes.getValue("name"));
        } else if (qName.equalsIgnoreCase("s")) {
            this.lang = Language.languageForISO639_1(attributes.getValue("lang"));
            this.tempVal = new StringBuffer();
        } else if (qName.equalsIgnoreCase("wordalignment")) {
            this.tempVal = new StringBuffer();
            this.langpair = LanguagePair.languageForISO639_1Pair(attributes.getValue("langpair"));
        } else if (qName.equalsIgnoreCase("pdoc")) {
            this.docName = attributes.getValue("name");
        } else {
            throw new SAXException("Unknown tag: " + qName);
        }
    }

    /** Collects character data while inside an {@code <s>} or {@code <wordalignment>} element. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (this.tempVal != null) {
            this.tempVal.append(ch, start, length);
        }
    }

    /**
     * Finishes the element that is being closed: stores segments and alignments in the current
     * chunk, hands completed chunks to the callback, and prints statistics at the end of the
     * document.
     *
     * @throws SAXException     if the element is not one of the four known tags
     * @throws RuntimeException if a {@code <wordalignment>} refers to a language for which the
     *                          current chunk has no segment
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (qName.equalsIgnoreCase("pchunk")) {
            this.pchunkCount++;
            this.cb_.handlePChunk(this.pchunk);
        } else if (qName.equalsIgnoreCase("s")) {
            String text = this.tempVal.toString().trim();
            // Always drop the buffer, also for empty segments, so that text outside of
            // <s>/<wordalignment> is never accumulated into a stale buffer.
            this.tempVal = null;
            if (text.length() == 0) {
                System.err.println(this.pchunk.getName() + ": Empty segment for lang=" + this.lang);
                return;
            }
            this.pchunk.addChunk(this.lang, new Chunk(text));
            this.chunkCount++;
        } else if (qName.equalsIgnoreCase("wordalignment")) {
            String alignmentPoints = this.tempVal.toString().trim();
            this.tempVal = null;
            Chunk sourceChunk = this.pchunk.getChunk(this.langpair.getSource());
            if (sourceChunk == null) {
                throw new RuntimeException("PChunk doesn't contain data for lang: " + this.langpair.getSource() + ".  Note: manual word alignment data must follow the chunk data.");
            }
            Chunk targetChunk = this.pchunk.getChunk(this.langpair.getTarget());
            if (targetChunk == null) {
                throw new RuntimeException("PChunk doesn't contain data for lang: " + this.langpair.getTarget() + ".  Note: manual word alignment data must follow the chunk data.");
            }
            ReferenceAlignment referenceAlignment = new ReferenceAlignment(sourceChunk.getLength(), targetChunk.getLength());
            referenceAlignment.addAlignmentPointsPharaoh(alignmentPoints);
            this.pchunk.addReferenceAlignment(this.langpair, referenceAlignment);
            this.refAlignCount++;
        } else if (qName.equalsIgnoreCase("pdoc")) {
            System.err.println("Finished parsing document " + this.docName);
            System.err.println("  pchunks: " + this.pchunkCount);
            System.err.println("  chunks: " + this.chunkCount);
            System.err.println("  ref alignments: " + this.refAlignCount);
        } else {
            throw new SAXException("Unknown tag: " + qName);
        }
    }

    /**
     * Converts two line-aligned UTF-8 text files (and optionally a file with one Pharaoh-format
     * alignment per line) into a {@code <pdoc>} XML document.
     *
     * <p>Conversion stops at the end of the shorter text file; a warning is printed to stderr if
     * the files differ in length. Any exception is printed to stderr and otherwise swallowed.
     * All files are closed when the method returns, whether or not it succeeded.
     *
     * @param chunkNamePrefix prefix of each chunk name; the 1-based sentence number is appended
     * @param sourceFile      text file for the source language; its last path component (after
     *                        the final {@code '/'}) becomes the document name
     * @param targetFile      text file for the target language
     * @param alignmentFile   alignment file; must be set exactly when {@code readAlignments} is
     *                        true
     * @param outputFile      file to write the XML document to
     * @param outputEncoding  name of the charset used for the output file
     * @param sourceLangCode  ISO 639-1 code of the source language
     * @param targetLangCode  ISO 639-1 code of the target language
     * @param readAlignments  whether reference alignments are read from {@code alignmentFile}
     */
    private static void convertToXMLDocument(String chunkNamePrefix, String sourceFile, String targetFile, String alignmentFile, String outputFile, String outputEncoding, String sourceLangCode, String targetLangCode, boolean readAlignments) {
        BufferedReader sourceReader = null;
        BufferedReader targetReader = null;
        BufferedReader alignmentReader = null;
        BufferedWriter writer = null;
        try {
            boolean alignmentFileSet = alignmentFile != null && !alignmentFile.equals(AnchorTextConstants.EMPTY_STRING);
            if (readAlignments && !alignmentFileSet) {
                throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!");
            }
            if (!readAlignments && alignmentFileSet) {
                throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!");
            }

            sourceReader = new BufferedReader(new InputStreamReader(new FileInputStream(sourceFile), "UTF8"));
            targetReader = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), "UTF8"));
            if (readAlignments) {
                alignmentReader = new BufferedReader(new InputStreamReader(new FileInputStream(alignmentFile), "UTF8"));
            }
            // Resolve the charset before creating the output file. Its canonical name (e.g.
            // "UTF-8") goes into the XML declaration; OutputStreamWriter.getEncoding() would
            // return the historical Java name (e.g. "UTF8"), which is not a valid XML encoding
            // name.
            java.nio.charset.Charset outputCharset = java.nio.charset.Charset.forName(outputEncoding);
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), outputCharset));

            Language targetLanguage = Language.languageForISO639_1(targetLangCode);
            Language sourceLanguage = Language.languageForISO639_1(sourceLangCode);
            LanguagePair languagePair = null;
            if (readAlignments) {
                languagePair = LanguagePair.languageForISO639_1Pair(sourceLangCode + "-" + targetLangCode);
            }
            System.err.println("Reading " + sourceLanguage + " from: " + sourceFile);
            System.err.println("Reading " + targetLanguage + " from: " + targetFile);
            if (readAlignments) {
                System.err.println("Reading alignments (" + languagePair + ") from: " + alignmentFile);
            }

            writer.write("<?xml version=\"1.0\" encoding=\"" + outputCharset.name() + "\"?>");
            writer.newLine();
            // lastIndexOf yields -1 when there is no '/', so "+ 1" keeps the whole file name.
            String documentName = sourceFile.substring(sourceFile.lastIndexOf('/') + 1);
            writer.write("<pdoc name=\"" + documentName + "\">");
            writer.newLine();

            int sentenceCount = 0;
            String sourceLine;
            while ((sourceLine = sourceReader.readLine()) != null) {
                String targetLine = targetReader.readLine();
                if (targetLine == null) {
                    System.err.println("WARNING: " + targetFile + " has fewer lines than " + sourceFile);
                    break;
                }
                // Count only sentence pairs that are actually converted.
                sentenceCount++;
                String alignmentLine = null;
                if (readAlignments) {
                    alignmentLine = alignmentReader.readLine();
                    if (alignmentLine == null) {
                        System.err.println(alignmentFile + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
                    }
                }
                Chunk sourceChunk = new Chunk(sourceLine);
                Chunk targetChunk = new Chunk(targetLine);
                ParallelChunk parallelChunk = new ParallelChunk();
                parallelChunk.setName(chunkNamePrefix + sentenceCount);
                parallelChunk.addChunk(targetLanguage, targetChunk);
                parallelChunk.addChunk(sourceLanguage, sourceChunk);
                if (alignmentLine != null) {
                    ReferenceAlignment referenceAlignment = new ReferenceAlignment(sourceChunk.getLength(), targetChunk.getLength());
                    try {
                        referenceAlignment.addAlignmentPointsPharaoh(alignmentLine);
                        parallelChunk.addReferenceAlignment(languagePair, referenceAlignment);
                    } catch (RuntimeException e) {
                        // A bad alignment only costs this sentence its alignment; the sentence
                        // pair itself is still written.
                        System.err.println("Couldn't set alignment points for sentence # " + sentenceCount);
                        System.err.println(" " + sourceLanguage + ": len=" + sourceChunk.getLength() + " words=" + sourceChunk);
                        System.err.println(" " + targetLanguage + ": len=" + targetChunk.getLength() + " words=" + targetChunk);
                        System.err.println(" " + languagePair + ": " + alignmentLine);
                    }
                }
                writer.write(parallelChunk.toXML());
            }
            if (targetReader.readLine() != null) {
                System.err.println("WARNING: " + targetFile + " has more lines than " + sourceFile);
            }
            writer.write("</pdoc>");
            writer.newLine();
            // Close explicitly on the success path so that a failing flush is reported.
            writer.close();
            System.out.println("Converted " + sentenceCount + " sentences");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
            closeQuietly(sourceReader);
            closeQuietly(targetReader);
            closeQuietly(alignmentReader);
        }
    }

    /** Closes the resource if it is non-null, ignoring any failure to close it. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Cleanup only: the primary error, if any, has already been reported.
        }
    }

    /**
     * Converts a fixed Korean-English bitext, with alignments, to {@code /tmp/foo.xml}.
     * The input and output paths are hard-coded; command-line arguments are ignored.
     */
    public static void main(String[] strArr) {
        convertToXMLDocument("koen_jhu_", "/Users/redpony/bitexts/kkn-eng-alignments/kkn.utf8", "/Users/redpony/bitexts/kkn-eng-alignments/eng", "/Users/redpony/bitexts/kkn-eng-alignments/align", "/tmp/foo.xml", "utf8", "ko", "en", true);
    }
}
