Wednesday, July 21, 2010

Project 2 - Parsing EAD Files

This does largely the same thing as the previous file, except it handles EAD files instead of a MARC database. There are significant differences as a result, like code to cycle through the .xml files in a directory.

/**
 *
 */
package crrasolrindexer;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedHashSet;

import javax.xml.parsers.*;

import org.apache.solr.client.solrj.SolrServerException;
import org.xml.sax.*;
import org.xml.sax.helpers.*;

/**
 * @author slittle2
 *
 *  *** NOTE: This is older than CRRA_EADRetriever. It is recommended ***
 *  *** that you use CRRA_EADRetriever instead.                          ***
 *  ***
 *
 *         EADDataRetriever does the same thing as MarcDataRetriever, except
 *         that it does it on EAD files (XML format).
 *
 *         To this end, it cycles through the files in the given directory,
 *         parses each, extracts the relevant data, and puts it in a
 *         LinkedHashSet<IndexDatum>. Voila'!
 *
 *     TODO Doesn't distinguish all <unittitle> uses from the main one; others probably unnecessary
 *     TODO Some other fields probably useful, too.
 *     TODO Not all fields, like <bioghist>, are apparently grabbed correctly? Need to skip "head" elements, etc.
 *
 */
public class EADDataRetriever {

    /**
     * @param args
     */

    private static String testPathName = "C:/Documents and Settings/slittle2/Desktop/Index Data/ead/xml/";

    private static LinkedHashSet<IndexDatum> eadRecords = new LinkedHashSet<IndexDatum>();
    private static int recordQuantity = 0;
    private static boolean grabCharacters = false;
    private static boolean failFlag = false;
    private static boolean nestedTag = false;
    private static IndexDatum datum = new IndexDatum();

    // fieldName sets the field in datum that the parser passes data to,
    // and then datum is sent to eadRecords
    private static String fieldName = "";

    // Main() included more or less for testing purposes
    public static void main(String[] args) throws IOException, SolrServerException {
        eadLoader(testPathName);

        System.out.println("Number of records = " + eadRecords.size());
       
        Iterator<IndexDatum> iter = eadRecords.iterator(); // TODO remove test code
        while (iter.hasNext()) {
            datum = (IndexDatum) iter.next();
           
            // System.out.println(datum.toString()); // TODO remove test code

        }
       
        // Indexer.indexID(eadRecords, "http://localhost:8983/solr/core0/");
       
        // System.out.println("Successfully indexed... we hope.");
       
    }

    // Cycling through the files in the directory and loading each in turn
    public static LinkedHashSet<IndexDatum> eadLoader(String pathname) throws IOException, SolrServerException {

        String filename = "";

        // Initialize variables -- must be cleared every time parser is run!
        recordQuantity = 0;
        grabCharacters = false;
        failFlag = false;
        nestedTag = false;
        datum = new IndexDatum();
        eadRecords = new LinkedHashSet<IndexDatum>();

        // Cycle through all files; XmlFilter (below) makes sure each is
        // an XML file.
       
        File directory = new File( pathname );
        String[] eadFiles = directory.list( new XmlFilter() );

        // for (int i = 0; i < eadFiles.length; i++) { // TODO insert this into code, eliminating test below
        for (int i = 0; i < 4; i++) {

            filename = eadFiles[i];
            eadRecords.add(parse(pathname + filename)); // Returns all the EAD data
                                                        // from ONE file to
                                                        // eadRecords
            System.out.println("Successfully parsed " + filename + "!"); //TODO remove test code

            datum = new IndexDatum(); // This is REALLY, REALLY IMPORTANT! Bad things will happen if it is deleted!
        }

        // TODO test code
        Iterator<IndexDatum> iter = eadRecords.iterator();
        IndexDatum singleRecord = new IndexDatum();
        /* while (iter.hasNext()) { // Add each IndexDatum to the Index
            singleRecord = (IndexDatum) iter.next();
            System.out.println(singleRecord.toString()); // TODO remove
        }*/
       
        System.out.println("Number of records = " + eadRecords.size());
        // Indexer.indexID(eadRecords, "http://localhost:8983/solr/core0/"); // This is a hack--fix later TODO
        return eadRecords;
       
    }

    // To get the number of records found
    public static int getRecordQuantity() {
        return recordQuantity;
    }

    // To get the LinkedHashSet<IndexDatum> eadRecords
    public static LinkedHashSet<IndexDatum> getEadRecords() {
        return eadRecords;
    }

    // To return whether the parse succeeded
    public static boolean getFailFlag() {
        return failFlag;
    }

    // Parsing a given file into an IndexDatum
    private static IndexDatum parse(String filename)
            throws IOException {

        DefaultHandler handler = new EADHandler(); // Parse the file using the
                                                    // handler
        parseXmlFile(filename, handler, false);

        datum.setField("key", filename); //TODO this is temporary until we find a better key!
        datum.setField("text", "INSERT TEXT HERE");  // TODO change this to actual text
        datum.setField("type", "EAD");
       
        // datum = new IndexDatum(); // This is REALLY, REALLY IMPORTANT! Bad things will happen if it is deleted!
       
        return datum;
    }

    /*
     * EADHandler looks for the appropriate parts of the EAD record to grab:
     *
     * -> "title" <unittitle> <controlaccess><title>
     *
     * -> "subject" <controlaccess> <title role="subject"> <persname
     * role="subject"> <famname role="subject"> <corpname role="subject">
     * <subject> <subject source="local"> // Do I need to distinguish this from
     * the previous one? <geogname role="subject"> <genreform> <occupation>
     * <function>
     *
     * -> "author" <origination> <persname> <famname> <corpname> <controlaccess>
     * <persname> <famname> <corpname>
     *
     * -> "date" <unitdate>
     *
     * -> "note" <odd> <note> <accessrestrict> <legalstatus> <bibliography>
     * <scopecontent> <prefercite> <altformavail> <originalsloc> <sponsor>
     * <phystech> <userestrict> <acqinfo> <relatedmaterial> <separatedmaterial>
     * <bioghist> <langmaterial> <otherfindaid> <custodhist> <appraisal>
     * <processinfo> <accruals>
     *
     * Of course, "type" = "EAD" "key" = ? text = ? TODO figure this out
     *
     * Setting "key" to the filename for now...
     */
    static class EADHandler extends DefaultHandler {

        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {

            // qName determines which field, or possible fields, the characters
            // go in

            if (qName.equals("controlaccess") | qName.equals("origination")) {
                nestedTag = true;
                return;
            }

            // If we're already in a nested tag that matters...
            // fieldName is either subject, author, or title
            else if (nestedTag) {
                if (qName.equals("subject") | qName.equals("genreform")
                        | qName.equals("occupation") | qName.equals("function")) {
                    fieldName = "subject";
                } else if (qName.equals("subject") | qName.equals("genreform")
                        | qName.equals("occupation") | qName.equals("function")
                        | qName.equals("genreform")) {
                    try {
                        if ((attributes.getValue("role"))
                                .equalsIgnoreCase("subject")) {
                            fieldName = "subject";
                        }
                    } catch (NullPointerException npe) {
                        // No attributes present! Move along!
                    }
                } else if (qName.equals("persname") | qName.equals("famname")
                        | qName.equals("corpname")) {
                    fieldName = "author";
                } else if (qName.equals("title")) {
                    fieldName = "title";
                }
            }

            // Could be a title even if not a nested tag
            else if (qName.equals("unittitle")) {
                fieldName = "title";
            }

            // Could be a date
            else if (qName.equals("unitdate")) {
                fieldName = "date";
            }

            // Or it could be a "note"...
            else if (qName.equals("odd") | qName.equals("note")
                    | qName.equals("accessrestrict")
                    | qName.equals("legalstatus")
                    | qName.equals("bibliography")
                    | qName.equals("scopecontent") | qName.equals("prefercite")
                    | qName.equals("altformavail")
                    | qName.equals("originalsloc") | qName.equals("sponsor")
                    | qName.equals("phystech") | qName.equals("userestrict")
                    | qName.equals("acqinfo") | qName.equals("relatedmaterial")
                    | qName.equals("separatedmaterial")
                    | qName.equals("bioghist") | qName.equals("langmaterial")
                    | qName.equals("otherfindaid") | qName.equals("custodhist")
                    | qName.equals("appraisal") | qName.equals("processinfo")
                    | qName.equals("accruals")) {

                fieldName = "note";
            }

            else
                return; // if no match is made, return without setting
                        // grabCharacters

            grabCharacters = true;
            nestedTag = false;

        }

        // Overriden to set nestedTag to 'false' if necessary
        public void endElement(String uri, String localName, String qName)
                throws SAXException {
            if (qName.equals("controlaccess") | qName.equals("origination")) {
                nestedTag = false;
                return;
            }
        }

        public void characters(char[] ch, int start, int length)
                throws SAXException {
            if (grabCharacters) {
                try {
                    datum.concatenateField(fieldName, new String(ch, start,
                            length)
                            + "\n\t");
                } catch (IOException e) {
                    System.err
                            .println("*** Saving parsed data to Index Datum failed! ***");
                    e.printStackTrace();
                } finally {
                    grabCharacters = false;
                }
            }
        }

    }

    // Parses an XML file using a SAX parser.
    // If validating is true, the contents is validated against the DTD
    // specified in the file.
    public static void parseXmlFile(String filename, DefaultHandler handler,
            boolean validating) {
        try { // Create a builder factory
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setValidating(validating); // Create the builder and parse
                                                // the file
            factory.newSAXParser().parse(new File(filename), handler);

        } catch (SAXException e) { // A parsing error occurred; the xml input is
                                    // not valid
            System.err.println("*** SAX Exception ***");
            e.getStackTrace();
            failFlag = true;
        } catch (ParserConfigurationException e) {
            System.err.println("*** Parser Configuration Exception ***");
            e.getStackTrace();
            failFlag = true;
        } catch (IOException e) {
            System.err.println("*** IO Exception ***");
            e.getStackTrace();
            failFlag = true;
        } // End try-catch
    }

}

/**
 * Filename filter that keeps only *.xml entries which are not directories.
 */
class XmlFilter implements FilenameFilter {

    /**
     * Decides whether a directory entry belongs in the file list.
     *
     * @param dir
     *            the directory in which the entry was found.
     *
     * @param name
     *            the name of the entry
     *
     * @return true if and only if the entry is not a directory and its name
     *         ends with ".xml" (compared in lower case); false otherwise.
     */
    public boolean accept(File dir, String name) {
        File candidate = new File(dir, name);
        boolean hasXmlSuffix = name.toLowerCase().endsWith(".xml");
        return !candidate.isDirectory() && hasXmlSuffix;
    }
}

No comments:

Post a Comment