A Little Goes a Long Way: Project 2 - Beefing Up EAD File-Handling

This class is far more powerful than its predecessor, EADDataRetriever. Added functionality includes sending data to multiple VuFind fields at once, handling multiple kinds of schema files and EAD-VuFind crosswalks, and better documentation!

package crrasolrindexer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.Set;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;

import org.apache.solr.client.solrj.SolrServerException;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Loads EAD XML files from a directory and turns each one into a
 * {@code CRRA_Datum} ready to be sent to the indexer.
 *
 * <p>CRRA_EADRetriever does the same job as EADDataRetriever, except that it
 * uses the CRRA_Datum class. It cycles through the XML files in the given
 * directory, parses each with a SAX handler, copies the text selected by the
 * schema file into the matching fields, and collects the results in a
 * {@code LinkedHashSet<CRRA_Datum>}.
 *
 * <p>Configuration is done through the public static fields
 * {@link #schema_filename} (required), {@link #schema_presets} (optional) and
 * {@link #strictElementPaths}. All working state is static, so this class is
 * not safe for concurrent use.
 *
 * @author slittle2
 */
public class CRRA_EADRetriever {

    // This is used only in testing; if called from TextUICRRASI, it uses whatever is in the given properties file.
    private static final String testPathName = "C:/Documents and Settings/slittle2/Desktop/Index Data/ead/xml/";

    // The set of records to send to the Indexer
    private static LinkedHashSet<CRRA_Datum> eadRecords = new LinkedHashSet<CRRA_Datum>();
    private static int recordQuantity = 0; // The quantity of EAD records collected by the last eadLoader() run
    private static boolean grabCharacters = false; // Whether or not to stuff data into fields besides "allfields"
    private static boolean failFlag = false; // Whether any parsing operation of the last run failed

    // The record currently being filled by the parser
    private static CRRA_Datum datum = new CRRA_Datum(); // Using default schema here

    // The user-defined mapping from EAD to VuFind will be stored here:
    // key = set of EAD tag names, value = VuFind field name.
    // NOTE(review): because the tag set is the key, two schema lines with the
    // same tag set overwrite each other (the later line wins).
    private static LinkedHashMap<LinkedHashSet<String>, String> schema_map = new LinkedHashMap<LinkedHashSet<String>, String>();

    private static LinkedList<String> currentFieldSet = new LinkedList<String>(); // for keeping track of the current fields to send data to
    private static LinkedList<String> tagStack = new LinkedList<String>(); // for keeping track of tag nesting while parsing (innermost tag first)

    // Name for file with schema
    public static String schema_filename = "";

    // Whether to force parser to evaluate whether the tagStack and a given element "path" are identical before passing data
    public static boolean strictElementPaths = false;

    // Name for file with schema presets
    public static String schema_presets = "";

    // Stores the presets to add to each record right before moving on to the next parsing
    private static LinkedHashMap<String, String> presets_map = new LinkedHashMap<String, String>();

    /**
     * Test entry point: loads the EAD files, prints every record and sends
     * them to the local Solr core.
     *
     * @param args optional; {@code args[0]} overrides {@link #schema_filename}
     *             and {@code args[1]} overrides the built-in test directory.
     *             Without arguments the previous behaviour is kept, which
     *             means {@code schema_filename} must already have been set or
     *             {@link #eadLoader(String)} will throw.
     * @throws IOException         if the schema or an EAD file cannot be read
     * @throws SolrServerException declared for the loading and indexing calls
     */
    public static void main(String[] args) throws IOException, SolrServerException {
        String pathname = testPathName;
        if (args != null && args.length > 0) {
            schema_filename = args[0];
        }
        if (args != null && args.length > 1) {
            pathname = args[1];
        }

        LinkedHashSet<CRRA_Datum> records = eadLoader(pathname);

        System.out.println("Number of records = " + records.size());

        // A local variable is used so the shared parser datum is not clobbered.
        for (CRRA_Datum record : records) {
            System.out.println(record.toString());
        }

        Indexer.indexCD(records, "http://localhost:8983/solr/core0/");

        System.out.println("Successfully indexed... we hope.");
    }

    /**
     * Reads the schema (and optional presets) file, then parses every file
     * accepted by {@code XmlFilter} in the given directory.
     *
     * <p>{@link #schema_filename} MUST be set before calling. A record is added
     * for every file, even one whose parse failed; check
     * {@link #getFailFlag()} afterwards.
     *
     * @param pathname directory holding the EAD XML files
     * @return the records produced by this run, in directory-listing order
     * @throws IOException         if no schema file name is set, a schema or
     *                             presets line is malformed, the directory
     *                             cannot be listed, or a file cannot be read
     * @throws SolrServerException kept for interface compatibility
     */
    public static LinkedHashSet<CRRA_Datum> eadLoader(String pathname) throws IOException, SolrServerException {

        // Initialize variables -- must be cleared every time parser is run!
        recordQuantity = 0;
        failFlag = false;
        datum = new CRRA_Datum();
        eadRecords = new LinkedHashSet<CRRA_Datum>();
        // Mappings from an earlier run (possibly with another schema file)
        // must not leak into this one.
        schema_map.clear();
        presets_map.clear();

        if (schema_filename == null || schema_filename.length() == 0) {
            throw new IOException("schema_filename must be set before calling eadLoader()");
        }

        loadSchemaMap();

        // Open schema presets file, if there is one
        if (schema_presets != null && schema_presets.length() > 0) {
            loadPresets();
        }

        // Cycle through all files; XmlFilter makes sure each is an XML file.
        File directory = new File(pathname);
        String[] eadFiles = directory.list(new XmlFilter());
        if (eadFiles == null) {
            // File.list() returns null when the path is not a readable directory.
            throw new IOException("Cannot list EAD directory: " + pathname);
        }

        for (int i = 0; i < eadFiles.length; i++) {
            // Joining through File works whether or not pathname ends in a separator.
            String fullPath = new File(directory, eadFiles[i]).getPath();

            // Isolate this file's outcome while keeping failFlag cumulative.
            boolean failedEarlier = failFlag;
            failFlag = false;

            eadRecords.add(parse(fullPath)); // Returns all the EAD data from ONE file to eadRecords

            if (failFlag) {
                System.err.println("*** Failed to parse " + fullPath + "; its record may be incomplete ***");
            } else {
                System.out.println("Successfully parsed " + fullPath + "!");
            }
            failFlag = failFlag || failedEarlier;

            // This is REALLY, REALLY IMPORTANT! parse() fills and returns the
            // shared datum, so a fresh one is needed or the next file would
            // write into the record that was just stored.
            datum = new CRRA_Datum();
        }

        recordQuantity = eadRecords.size();
        System.out.println("Number of records = " + eadRecords.size());

        return eadRecords;
    }

    // Reads schema_filename into schema_map. Each line is
    // "<VuFind field> <EAD tag> [<EAD tag> ...]"; a line whose first word is
    // "strictElementPaths" switches strict matching on instead.
    private static void loadSchemaMap() throws IOException {
        BufferedReader inFile = null;
        try {
            inFile = new BufferedReader(new FileReader(schema_filename));
            String data;
            while ((data = inFile.readLine()) != null) {
                String line = data.trim();
                if (line.length() == 0) {
                    continue; // blank lines carry no mapping
                }

                String[] schema_entry = line.split(" ", 2);

                // Check for schema meta-info
                if (schema_entry[0].equalsIgnoreCase("strictElementPaths")) {
                    strictElementPaths = true;
                } else {
                    if (schema_entry.length < 2) {
                        throw new IOException("Malformed line in " + schema_filename
                                + " (expected \"field tag [tag ...]\"): " + data);
                    }
                    schema_map.put(addTagSet(schema_entry[1]), schema_entry[0]);
                }
            }
        } finally {
            if (inFile != null) {
                inFile.close();
            }
        }
    }

    // Reads schema_presets into presets_map. Each line is "<field> <value>",
    // where the value is everything after the first space.
    private static void loadPresets() throws IOException {
        BufferedReader inFile = null;
        try {
            inFile = new BufferedReader(new FileReader(schema_presets));
            String data;
            while ((data = inFile.readLine()) != null) {
                String line = data.trim();
                if (line.length() == 0) {
                    continue; // blank lines carry no preset
                }

                String[] schema_entry = line.split(" ", 2);
                if (schema_entry.length < 2) {
                    throw new IOException("Malformed line in " + schema_presets
                            + " (expected \"field value\"): " + data);
                }
                presets_map.put(schema_entry[0], schema_entry[1]);
            }
        } finally {
            if (inFile != null) {
                inFile.close();
            }
        }
    }

    // Used to create schema for parsing: splits a space-separated list of tag
    // names into a set (insertion-ordered, duplicates dropped).
    private static LinkedHashSet<String> addTagSet(String string) {
        LinkedHashSet<String> returnSet = new LinkedHashSet<String>();
        for (String tag : string.split(" ")) {
            returnSet.add(tag);
        }
        return returnSet;
    }

    /**
     * @return the number of records collected by the last
     *         {@link #eadLoader(String)} run
     */
    public static int getRecordQuantity() {
        return recordQuantity;
    }

    /**
     * @return the records produced by the last {@link #eadLoader(String)} run
     *         (the live set, not a copy)
     */
    public static LinkedHashSet<CRRA_Datum> getEadRecords() {
        return eadRecords;
    }

    /**
     * @return {@code true} if any XML parse has failed since the flag was last
     *         reset by {@link #eadLoader(String)}
     */
    public static boolean getFailFlag() {
        return failFlag;
    }

    // Parses one file into the shared datum, then adds the preset fields and
    // the raw file content ("fullrecord"), and returns that datum.
    private static CRRA_Datum parse(String filename)
            throws IOException {

        // Start from a clean parser state; a failed parse of the previous
        // file can leave tags on the stack.
        tagStack.clear();
        currentFieldSet = new LinkedList<String>();
        grabCharacters = false;

        // Parse the file using the handler and given schema
        parseXmlFile(filename, new EADHandler(), false);

        // Add preset fields
        for (String field : presets_map.keySet()) {
            datum.setField(field, presets_map.get(field));
        }

        // Open file for sending to 'fullrecord' field, one line at a time
        BufferedReader inFile = null;
        try {
            inFile = new BufferedReader(new FileReader(filename));
            String data;
            while ((data = inFile.readLine()) != null) {
                datum.concatenateField("fullrecord", data);
            }
        } finally {
            if (inFile != null) {
                inFile.close();
            }
        }

        return datum;
    }

    /**
     * @return the datum the parser is currently filling
     */
    public static CRRA_Datum returnCurrentCD() {
        return datum;
    }

    /*
     * EADHandler looks for the appropriate parts of the EAD record to grab.
     *
     * A stack (tagStack) is used to keep track of the element "pathname".
     * Every time a new element is reached, its name is put on the stack;
     * when a close-element is encountered, its name is popped off, along
     * with any elements "on top" of it (like <p> tags and the like).
     *
     * If strictElementPaths is 'true', then data will be collected for VuFind
     * if and only if the tag names on the tagStack are exactly the tag names
     * of at least one set in the schema (order and repetition are ignored).
     *
     * The currentFieldSet (CFS) is the set of VuFind fields to send the
     * current data to. Every time an element starts or ends, the CFS gets
     * wiped out and recalculated from scratch. Inelegant, but effective.
     *
     * If the CFS is not empty when an element starts, grabCharacters is true
     * and the text that follows is collected until the next element boundary,
     * where it is written to every field in the CFS. All text, captured or
     * not, is also appended to "allfields".
     */
    static class EADHandler extends DefaultHandler {

        // Text collected since the last element boundary. SAX may deliver one
        // run of text in several characters() calls, so it is buffered here
        // and written out in one piece.
        private final StringBuilder pendingText = new StringBuilder();
        private boolean textPending = false;

        @Override
        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {

            // Text seen so far belongs to the fields computed for the old stack.
            flushPendingText();

            // Add the tag name to the stack
            tagStack.addFirst(qName);

            // Recompute the CFS so it holds only fields corresponding to the
            // tags currently on the stack.
            currentFieldSet = new LinkedList<String>();
            updateCFS();

            grabCharacters = !currentFieldSet.isEmpty();
        }

        // Overridden to remove closed tags from the tagStack and update the CFS.
        @Override
        public void endElement(String uri, String localName, String qName)
                throws SAXException {

            flushPendingText();

            // Remove any non-closed tags on the front of the stack, plus the
            // closed tag itself -- but stop there, so that an enclosing
            // element with the same name stays on the stack.
            if (tagStack.contains(qName)) {
                String popped;
                do {
                    popped = tagStack.removeFirst();
                } while (!popped.equals(qName));
            }

            // Start from an empty list; otherwise fields of the closed
            // element linger and matching fields are added a second time.
            currentFieldSet = new LinkedList<String>();
            updateCFS();
        }

        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {

            String text = new String(ch, start, length);

            try {
                datum.concatenateField("allfields", text);
            } catch (IOException e1) {
                e1.printStackTrace();
            }

            if (grabCharacters) {
                pendingText.append(text);
                textPending = true;
            }
        }

        // Writes the buffered text to every field in the CFS and then stops
        // capturing until the next start tag turns it on again.
        private void flushPendingText() {
            if (!textPending) {
                return;
            }

            String text = pendingText.toString();
            try {
                for (String field : currentFieldSet) {
                    datum.concatenateField(field, text + "\n\t");

                    // Eliminates unnecessary whitespace (for every field, not
                    // only the last one)
                    datum.setField(field, datum.returnField(field).trim());
                }
            } catch (IOException e) {
                System.err
                        .println("*** Saving parsed data to CRRA Datum failed! ***");
                e.printStackTrace();
            } finally {
                pendingText.setLength(0);
                textPending = false;
                grabCharacters = false;
            }
        }

    }

    /**
     * Parses an XML file using a SAX parser. Errors are reported on
     * {@code System.err} and recorded in the fail flag instead of being
     * thrown.
     *
     * @param filename   path of the XML file to parse
     * @param handler    receives the SAX events
     * @param validating if true, the contents is validated against the DTD
     *                   specified in the file
     */
    public static void parseXmlFile(String filename, DefaultHandler handler,
            boolean validating) {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setValidating(validating);
            factory.newSAXParser().parse(new File(filename), handler);

        } catch (SAXException e) { // A parsing error occurred; the xml input is not valid
            System.err.println("*** SAX Exception ***");
            e.printStackTrace();
            failFlag = true;
        } catch (ParserConfigurationException e) {
            System.err.println("*** Parser Configuration Exception ***");
            e.printStackTrace();
            failFlag = true;
        } catch (IOException e) {
            System.err.println("*** IO Exception in parseXmlFile ***");
            e.printStackTrace();
            failFlag = true;
        }
    }

    /**
     * Appends to the current field set the VuFind field of every schema entry
     * that matches the tag stack. An entry matches when all of its tags are on
     * the stack and, if {@link #strictElementPaths} is set, the stack holds no
     * other tag names. The list is not cleared here; callers reset it first.
     */
    public static void updateCFS() {
        for (LinkedHashSet<String> tags : schema_map.keySet()) {
            boolean stackHasAllTags = tagStack.containsAll(tags);
            boolean noExtraTagsOnStack = !strictElementPaths || tags.containsAll(tagStack);

            if (stackHasAllTags && noExtraTagsOnStack) {
                currentFieldSet.add(schema_map.get(tags));
            }
        }
    }

}

A Little Goes a Long Way

Wednesday, July 21, 2010

Project 2 - Beefing Up EAD File-Handling

No comments:

Post a Comment

Evolution

Followers