This class is far more powerful than its predecessor, EADDataRetriever. Added functionality includes sending data to multiple VuFind fields at once, handling multiple kinds of schema files and EAD-VuFind crosswalks, and better documentation!
package crrasolrindexer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import org.apache.solr.client.solrj.SolrServerException;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* @author slittle2
*
* CRRA_EADRetriever does the same thing as EADDataRetriever, except
* using the new CRRA_Datum class.
*
* To this end, it cycles through the files in the given directory,
* parses each, extracts the relevant data, and puts it in a
* LinkedHashSet<CRRA_Datum>. Voila!
*
*/
public class CRRA_EADRetriever {
// ---------------------------------------------------------------------
// Shared parser state. Everything below is static, so the class keeps a
// single parsing session's worth of state at a time.
// ---------------------------------------------------------------------
// Directory of EAD files used by main().
// This is used only in testing; if called from TextUICRRASI, it uses whatever is in the given properties file.
private static String testPathName = "C:/Documents and Settings/slittle2/Desktop/Index Data/ead/xml/";
// The set of records to send to the Indexer; replaced with a fresh set at
// the start of every eadLoader run.
private static LinkedHashSet<CRRA_Datum> eadRecords = new LinkedHashSet<CRRA_Datum>();
// The quantity of EAD records parsed; reset to 0 at the start of every
// eadLoader run.
private static int recordQuantity = 0;
// Whether or not the next characters() call should stuff data into fields
// besides "allfields". Set by EADHandler.startElement, cleared again after
// characters() has written to the target fields.
private static boolean grabCharacters = false;
// Failure indicator: set to true when parseXmlFile catches an exception,
// reset to false at the start of every eadLoader run. (true == something
// FAILED.)
private static boolean failFlag = false;
// The record currently being filled by the parser. Using default schema here.
private static CRRA_Datum datum = new CRRA_Datum();
// The user-defined mapping from EAD to VuFind will be stored here.
// Key: a set of EAD element names; value: the VuFind field that receives
// the text when those elements are on the tag stack (see updateCFS).
private static LinkedHashMap<LinkedHashSet<String>, String> schema_map = new LinkedHashMap<LinkedHashSet<String>, String>();
// The "CFS": the VuFind fields the current character data should be sent to.
private static LinkedList<String> currentFieldSet = new LinkedList<String>();
// Names of the currently open elements, innermost first (startElement
// pushes onto the front); tracks tag nesting while parsing.
private static LinkedList<String> tagStack = new LinkedList<String>();
// fieldName sets the field in datum that the parser passes data to,
// and then datum is sent to eadRecords
private static String fieldName = "";
// Name for file with schema. Must be set to a non-empty value before
// eadLoader is called; eadLoader throws IOException otherwise.
public static String schema_filename = "";
// Whether to force parser to evaluate whether the tagStack and a given element "path" are identical before passing data.
// Switched on by eadLoader when the schema file contains a line whose
// first word is "strictElementPaths"; callers may also set it directly.
public static boolean strictElementPaths = false;
// Name for file with schema presets; left empty when there is no presets file.
public static String schema_presets = "";
// Stores the presets (field name -> value) to add to each record right before moving on to the next parsing
private static LinkedHashMap<String, String> presets_map = new LinkedHashMap<String, String>();
/**
 * Test driver: loads EAD records from a directory, prints each one, and
 * sends the whole set to the Solr core at
 * {@code http://localhost:8983/solr/core0/}.
 *
 * <p>Optional arguments (both may be omitted, in which case the behaviour
 * is the same as before they existed):
 * <ul>
 * <li>{@code args[0]} - schema file; stored in {@code schema_filename}.
 *     Without it {@code schema_filename} keeps whatever value it already
 *     has, and {@code eadLoader} throws {@code IOException} if that is
 *     empty.</li>
 * <li>{@code args[1]} - directory of EAD files; defaults to
 *     {@code testPathName}.</li>
 * </ul>
 *
 * @param args optional schema file and EAD directory, as described above
 * @throws IOException if the schema file is unset or a file cannot be read
 * @throws SolrServerException declared by the loader and the indexer
 */
public static void main(String[] args) throws IOException, SolrServerException {
    // Without this, a stand-alone run could never get past eadLoader's
    // "schema_filename must be set" check.
    if (args.length > 0) {
        schema_filename = args[0];
    }
    String eadDirectory = (args.length > 1) ? args[1] : testPathName;

    LinkedHashSet<CRRA_Datum> records = eadLoader(eadDirectory);
    System.out.println("Number of records = " + records.size());

    // Use a local loop variable; the shared static 'datum' belongs to the
    // parser and should not be overwritten just to print the results.
    for (CRRA_Datum record : records) {
        System.out.println(record.toString());
    }

    Indexer.indexCD(records, "http://localhost:8983/solr/core0/");
    System.out.println("Successfully indexed... we hope.");
}
// Default cap on the number of files parsed by eadLoader(String). The value
// 4 was introduced to keep test runs short; use eadLoader(String, int) with
// Integer.MAX_VALUE to parse every file in the directory.
private static final int DEFAULT_FILE_LIMIT = 4;

/**
 * Cycles through the XML files in the given directory and parses each in
 * turn, using the default file cap ({@code DEFAULT_FILE_LIMIT}).
 *
 * @param pathname directory containing the EAD files; it is concatenated
 *                 directly with each file name, so it must end with a
 *                 path separator
 * @return the set of parsed records (the same set returned by
 *         {@code getEadRecords()})
 * @throws IOException if {@code schema_filename} is unset, the directory
 *                     cannot be listed, or a file cannot be read
 * @throws SolrServerException never thrown here; kept for existing callers
 */
public static LinkedHashSet<CRRA_Datum> eadLoader(String pathname) throws IOException, SolrServerException {
    return eadLoader(pathname, DEFAULT_FILE_LIMIT);
}

/**
 * Cycles through the XML files in the given directory and parses at most
 * {@code maxFiles} of them.
 *
 * <p>{@code schema_filename} MUST be set before calling. All per-run state
 * (record set, counters, failure flag, schema and preset maps) is reset
 * first, so repeated calls do not see leftovers from an earlier run.
 *
 * @param pathname directory containing the EAD files; must end with a
 *                 path separator
 * @param maxFiles upper bound on the number of files parsed; values below
 *                 zero are treated as zero
 * @return the set of parsed records
 * @throws IOException if {@code schema_filename} is unset, the directory
 *                     cannot be listed, or a file cannot be read
 */
public static LinkedHashSet<CRRA_Datum> eadLoader(String pathname, int maxFiles) throws IOException {
    // Initialize variables -- must be cleared every time parser is run!
    recordQuantity = 0;
    failFlag = false;
    datum = new CRRA_Datum();
    eadRecords = new LinkedHashSet<CRRA_Datum>();
    // The maps are filled from files below; clear them so entries from a
    // previously loaded schema/presets file cannot linger.
    schema_map.clear();
    presets_map.clear();

    if (schema_filename == null || schema_filename.equalsIgnoreCase("")) {
        throw new IOException("schema_filename must be set before calling eadLoader");
    }
    loadSchemaMap(schema_filename);

    // The presets file is optional.
    if (schema_presets != null && !schema_presets.equalsIgnoreCase("")) {
        loadPresetsMap(schema_presets);
    }

    // Cycle through the files; XmlFilter makes sure each is an XML file.
    File directory = new File(pathname);
    String[] eadFiles = directory.list(new XmlFilter());
    if (eadFiles == null) {
        // File.list returns null when the path is not a directory or an
        // I/O error occurs.
        throw new IOException("Cannot list EAD directory: " + pathname);
    }

    // Never index past the end of the listing, even when the directory
    // holds fewer files than the cap.
    int fileCount = Math.min(eadFiles.length, Math.max(maxFiles, 0));
    for (int i = 0; i < fileCount; i++) {
        String filename = eadFiles[i];
        // parse() returns all the EAD data from ONE file.
        eadRecords.add(parse(pathname + filename));
        System.out.println("Successfully parsed " + pathname + filename + "!");
        // The parser fills the shared 'datum'; it must be replaced before
        // the next file or every record would be the same object.
        datum = new CRRA_Datum();
    }

    recordQuantity = eadRecords.size();
    System.out.println("Number of records = " + eadRecords.size());
    return eadRecords;
}

// Reads the schema file into schema_map. Each line is "<field> <tag> <tag> ...":
// the first word is the VuFind field, the rest is the set of EAD element
// names. A line whose first word is "strictElementPaths" turns strict
// matching on instead of adding a mapping.
private static void loadSchemaMap(String schemaFile) throws IOException {
    BufferedReader inFile = null;
    try {
        inFile = new BufferedReader(new FileReader(schemaFile));
        String data;
        while ((data = inFile.readLine()) != null) {
            if (data.trim().length() == 0) {
                continue; // blank line
            }
            // Split at the FIRST space: [field, remaining tag list]
            String[] schema_entry = data.split(" ", 2);
            if (schema_entry[0].equalsIgnoreCase("strictElementPaths")) {
                strictElementPaths = true;
            } else if (schema_entry.length < 2) {
                System.err.println("*** Ignoring schema line without element names: " + data + " ***");
            } else {
                schema_map.put(addTagSet(schema_entry[1]), schema_entry[0]);
            }
        }
    } finally {
        if (inFile != null) {
            inFile.close();
        }
    }
}

// Reads the presets file into presets_map. Each line is "<field> <value>",
// split at the first space; the value may itself contain spaces.
private static void loadPresetsMap(String presetsFile) throws IOException {
    BufferedReader inFile = null;
    try {
        inFile = new BufferedReader(new FileReader(presetsFile));
        String data;
        while ((data = inFile.readLine()) != null) {
            if (data.trim().length() == 0) {
                continue; // blank line
            }
            String[] schema_entry = data.split(" ", 2);
            if (schema_entry.length < 2) {
                System.err.println("*** Ignoring preset line without a value: " + data + " ***");
                continue;
            }
            presets_map.put(schema_entry[0], schema_entry[1]);
        }
    } finally {
        if (inFile != null) {
            inFile.close();
        }
    }
}
/**
 * Turns a whitespace-separated list of element names from a schema line
 * into a set, preserving the order in which the names appear.
 *
 * <p>Runs of whitespace (double spaces, tabs, leading/trailing blanks) are
 * treated as a single separator, so they no longer produce an empty-string
 * "tag" that could never match an element and would silently disable the
 * mapping.
 *
 * @param string the tag list, e.g. {@code "did unittitle"}
 * @return the distinct tag names in order of appearance; if the input holds
 *         no names at all, a set containing only the empty string
 */
private static LinkedHashSet<String> addTagSet(String string) {
    LinkedHashSet<String> returnSet = new LinkedHashSet<String>();
    String[] tagSet = (string == null) ? new String[0] : string.trim().split("\\s+");
    for (int i = 0; i < tagSet.length; i++) {
        if (tagSet[i].length() > 0) {
            returnSet.add(tagSet[i]);
        }
    }
    if (returnSet.isEmpty()) {
        // Keep an unmatchable placeholder rather than an empty set: a
        // containsAll() test against an empty set is always true, which
        // would make a mapping with no tags match every element.
        returnSet.add("");
    }
    return returnSet;
}
/**
 * Reports the record counter kept by this class.
 *
 * @return the current value of {@code recordQuantity}, which is reset to 0
 *         at the start of every {@code eadLoader} run
 */
public static int getRecordQuantity() {
    return recordQuantity;
}
/**
 * Gives access to the records collected by the most recent
 * {@code eadLoader} run.
 *
 * @return the live {@code eadRecords} set itself (not a copy)
 */
public static LinkedHashSet<CRRA_Datum> getEadRecords() {
    return eadRecords;
}
/**
 * Reports whether parsing has failed.
 *
 * @return {@code true} if {@code parseXmlFile} has caught an exception
 *         since the flag was last reset by {@code eadLoader};
 *         {@code false} otherwise
 */
public static boolean getFailFlag() {
    return failFlag;
}
/**
 * Parses one EAD file into the shared {@code datum} and returns it.
 *
 * <p>Three things are written to the record: the fields filled in by
 * {@code EADHandler} during the SAX parse, every preset from
 * {@code presets_map}, and the raw file contents, appended line by line
 * to the {@code "fullrecord"} field.
 *
 * <p>Parse errors are not thrown from here: {@code parseXmlFile} reports
 * them through the fail flag, and the (possibly partial) record is still
 * returned.
 *
 * @param filename path of the XML file to parse
 * @return the shared {@code datum}, now populated for this file
 * @throws IOException if the file cannot be re-read for the full record,
 *                     or the record rejects a field update
 */
private static CRRA_Datum parse(String filename)
        throws IOException {
    // Start every file from a clean parser state. If the previous parse
    // was aborted by an exception, its open tags would otherwise still be
    // on the stack and influence field matching for this file.
    tagStack.clear();
    currentFieldSet.clear();
    grabCharacters = false;

    // Parse the file using the handler and the loaded schema.
    DefaultHandler handler = new EADHandler();
    parseXmlFile(filename, handler, false);

    // Add preset fields: every preset is copied onto the record.
    for (String field : presets_map.keySet()) {
        datum.setField(field, presets_map.get(field));
    }

    // Re-read the file and send its text to the 'fullrecord' field.
    // readLine() strips line terminators, so the lines are passed on
    // without them.
    BufferedReader inFile = null;
    try {
        inFile = new BufferedReader(new FileReader(filename));
        String data;
        while ((data = inFile.readLine()) != null) {
            datum.concatenateField("fullrecord", data);
        }
    } finally {
        if (inFile != null) {
            inFile.close();
        }
    }
    return datum;
}
/**
 * Exposes the record the parser is currently filling.
 *
 * @return the shared {@code datum} instance (not a copy)
 */
public static CRRA_Datum returnCurrentCD() {
    return datum;
}
/*
 * EADHandler looks for the appropriate parts of the EAD record to grab.
 *
 * A stack (tagStack) is used to keep track of the element "pathname".
 * Every time a new element is reached, its name is put on the stack;
 * when a close-element is encountered, its name is popped off, along
 * with any elements "on top" of it (like <p> tags and the like).
 *
 * If strictElementPaths is 'true', then data will be collected for VuFind
 * if and only if the elements on the tagStack match exactly at least one
 * set of elements in the schema.
 *
 * The currentFieldSet (CFS) is the set of VuFind fields to send the current
 * data to. Every time an element is opened or closed, the CFS gets wiped
 * out and recalculated from scratch. Inelegant, but effective.
 *
 * When an element is opened and the CFS is not empty, grabCharacters is
 * set and the next characters() call sends its text to every field in
 * the CFS; the flag is cleared again after that call.
 *
 * NOTE(review): a SAX parser is allowed to deliver one run of text in
 * several characters() calls. Because grabCharacters is cleared after the
 * first call, later chunks only reach "allfields". Confirm whether that
 * is intended before changing it.
 */
static class EADHandler extends DefaultHandler {

    /**
     * Pushes the opened element onto the tag stack and recomputes which
     * fields should receive its text.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        // qName determines which field, or possible fields, the characters
        // go in.
        tagStack.addFirst(qName);

        // Wipe the CFS and rebuild it from the tags now on the stack.
        currentFieldSet = new LinkedList<String>();
        updateCFS();
        grabCharacters = !currentFieldSet.isEmpty();
    }

    /**
     * Pops the closed element (and anything still sitting on top of it)
     * off the tag stack and recomputes the CFS.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        // Pop through the NEAREST occurrence of the closed tag only.
        // Popping until no occurrence remains would also discard open
        // ancestors with the same name (e.g. nested <c> or <list>
        // elements), corrupting the path for the rest of the document.
        if (tagStack.contains(qName)) {
            String popped;
            do {
                popped = tagStack.removeFirst();
            } while (!popped.equals(qName));
        }

        // Rebuild the CFS from scratch; appending to the old one would
        // keep the closed element's fields and duplicate the parent's.
        currentFieldSet = new LinkedList<String>();
        updateCFS();
        if (currentFieldSet.isEmpty()) {
            // Nothing to write to, so never grab. (The flag is deliberately
            // NOT switched on here: only startElement arms it.)
            grabCharacters = false;
        }
    }

    /**
     * Appends the text to "allfields" and, when grabCharacters is set, to
     * every field in the CFS.
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        String text = new String(ch, start, length);

        // Every piece of text goes to "allfields", mapped or not.
        try {
            datum.concatenateField("allfields", text);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        if (!grabCharacters) {
            return;
        }
        try {
            // Update each field in 'currentFieldSet' in the current 'datum'
            for (String target : currentFieldSet) {
                fieldName = target;
                datum.concatenateField(fieldName, text + "\n\t");
                // Eliminates unnecessary whitespace. Done for every target
                // field, not just the last one in the set.
                datum.setField(fieldName, datum.returnField(fieldName).trim());
            }
        } catch (IOException e) {
            System.err
                    .println("*** Saving parsed data to CRRA Datum failed! ***");
            e.printStackTrace();
        } finally {
            grabCharacters = false;
        }
    }
}
/**
 * Parses an XML file using a SAX parser.
 *
 * <p>No exception escapes this method. Any failure is reported on
 * {@code System.err} together with its stack trace, and the class-wide
 * fail flag is set so callers can check {@code getFailFlag()} afterwards.
 *
 * @param filename   path of the XML file to parse
 * @param handler    receives the SAX events
 * @param validating if true, the contents are validated against the DTD
 *                   specified in the file
 */
public static void parseXmlFile(String filename, DefaultHandler handler,
        boolean validating) {
    try {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setValidating(validating);
        factory.newSAXParser().parse(new File(filename), handler);
    } catch (SAXException e) {
        // A parsing error occurred; the xml input is not valid
        reportParseFailure("*** SAX Exception ***", e);
    } catch (ParserConfigurationException e) {
        reportParseFailure("*** Parser Configuration Exception ***", e);
    } catch (IOException e) {
        reportParseFailure("*** IO Exception in parseXmlFile ***", e);
    }
}

// Prints the banner and the stack trace, then records the failure.
// printStackTrace() is required here: getStackTrace() only returns the
// frames and prints nothing, which left failures without any diagnostics.
private static void reportParseFailure(String banner, Exception cause) {
    System.err.println(banner);
    cause.printStackTrace();
    failFlag = true;
}
/**
 * Adds to {@code currentFieldSet} the VuFind field of every schema entry
 * that matches the current tag stack. Existing entries of the list are
 * left in place; callers that want a fresh result must empty it first.
 *
 * <p>A schema entry matches when all of its element names are on the tag
 * stack. With {@code strictElementPaths} on, the reverse must hold too:
 * every name on the stack has to appear in the entry (order and repeats
 * are not compared).
 */
public static void updateCFS() {
    for (LinkedHashSet<String> requiredTags : schema_map.keySet()) {
        // Every tag named by the schema entry must currently be open.
        if (!tagStack.containsAll(requiredTags)) {
            continue;
        }
        // Strict mode: no open tag may fall outside the schema entry.
        if (strictElementPaths && !requiredTags.containsAll(tagStack)) {
            continue;
        }
        currentFieldSet.add(schema_map.get(requiredTags));
    }
}
}
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment