This does largely the same thing as the previous file, except it handles EAD files instead of a MARC database. There are significant differences as a result, like code to cycle through the .xml files in a directory.
/**
*
*/
package crrasolrindexer;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedHashSet;
import javax.xml.parsers.*;
import org.apache.solr.client.solrj.SolrServerException;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
/**
* @author slittle2
*
* *** NOTE: This is older than CRRA_EADRetriever. It is recommended ***
* *** that you use CRRA_EADRetriever instead. ***
* ***
*
* EADDataRetriever does the same thing as MarcDataRetriever, except
* that it does it on EAD files (XML format).
*
* To this end, it cycles through the files in the given directory,
* parses each, extracts the relevant data, and puts it in a
* LinkedHashSet<IndexDatum>. Voila'!
*
* TODO Doesn't distinguish all <unittitle> uses from the main one; others probably unnecessary
* TODO Some other fields probably useful, too.
* TODO Not all fields (e.g. <bioghist>) appear to be grabbed correctly; need to skip "head" elements, etc.
*
*/
public class EADDataRetriever {

    /** Directory scanned by {@link #main(String[])}; a developer test path. */
    private static String testPathName = "C:/Documents and Settings/slittle2/Desktop/Index Data/ead/xml/";

    /** One IndexDatum per EAD file handled by the most recent {@link #eadLoader(String)} run. */
    private static LinkedHashSet<IndexDatum> eadRecords = new LinkedHashSet<IndexDatum>();

    /** Number of records produced by the most recent {@link #eadLoader(String)} run. */
    private static int recordQuantity = 0;

    /** True from a matched start tag until the text that follows it has been stored. */
    private static boolean grabCharacters = false;

    /** Set to true as soon as listing the directory, parsing a file, or storing text fails. */
    private static boolean failFlag = false;

    /** True while the parser is inside a {@code <controlaccess>} or {@code <origination>} element. */
    private static boolean nestedTag = false;

    /** Record currently being filled by the SAX handler; replaced for every file. */
    private static IndexDatum datum = new IndexDatum();

    /** Name of the field in {@link #datum} that collected text is appended to. */
    private static String fieldName = "";

    /**
     * Test entry point: loads the EAD files under the test directory and
     * prints how many records were produced.
     *
     * @param args ignored
     * @throws IOException if the test directory cannot be listed or a record field cannot be set
     * @throws SolrServerException declared for callers; not thrown by the current code
     */
    public static void main(String[] args) throws IOException, SolrServerException {
        eadLoader(testPathName);
        System.out.println("Number of records = " + eadRecords.size());
    }

    /**
     * Parses every *.xml file found directly in {@code pathname} and collects
     * one IndexDatum per file.
     *
     * All static parser state is cleared first, so each call starts from
     * scratch and replaces the result of any previous call. A file that fails
     * to parse still contributes a record (with key/text/type set); check
     * {@link #getFailFlag()} to find out whether anything went wrong.
     *
     * @param pathname directory holding the EAD files
     * @return the records collected during this run (the same set that
     *         {@link #getEadRecords()} returns afterwards)
     * @throws IOException if {@code pathname} cannot be listed as a directory,
     *         or if setting a field on a record fails
     * @throws SolrServerException declared for callers; not thrown by the current code
     */
    public static LinkedHashSet<IndexDatum> eadLoader(String pathname) throws IOException, SolrServerException {
        // Initialize variables -- must be cleared every time the loader is run!
        recordQuantity = 0;
        grabCharacters = false;
        failFlag = false;
        nestedTag = false;
        fieldName = "";
        datum = new IndexDatum();
        eadRecords = new LinkedHashSet<IndexDatum>();

        File directory = new File(pathname);
        // XmlFilter makes sure each entry is an XML file.
        String[] eadFiles = directory.list(new XmlFilter());
        if (eadFiles == null) {
            // File.list returns null when the path is not a directory or an
            // I/O error occurs; report that instead of failing with an NPE.
            failFlag = true;
            throw new IOException("Cannot list EAD directory: " + pathname);
        }

        // Process every file that was found (this used to be capped at four).
        for (String filename : eadFiles) {
            eadRecords.add(parse(resolvePath(pathname, directory, filename)));
            System.out.println("Successfully parsed " + filename + "!"); // TODO remove test code
        }

        recordQuantity = eadRecords.size();
        System.out.println("Number of records = " + eadRecords.size());
        return eadRecords;
    }

    /**
     * Builds the path handed to the parser (and stored as the record key).
     * Plain concatenation is kept when {@code pathname} already ends in a
     * separator so existing keys do not change; otherwise the separator is
     * supplied by {@link File}.
     */
    private static String resolvePath(String pathname, File directory, String filename) {
        if (pathname.endsWith("/") || pathname.endsWith(File.separator)) {
            return pathname + filename;
        }
        return new File(directory, filename).getPath();
    }

    /** @return the number of records produced by the most recent {@link #eadLoader(String)} run */
    public static int getRecordQuantity() {
        return recordQuantity;
    }

    /** @return the records collected by the most recent {@link #eadLoader(String)} run */
    public static LinkedHashSet<IndexDatum> getEadRecords() {
        return eadRecords;
    }

    /** @return true if any failure was recorded since the last {@link #eadLoader(String)} call started */
    public static boolean getFailFlag() {
        return failFlag;
    }

    /**
     * Parses one EAD file into a fresh IndexDatum.
     *
     * @param filename path of the file to parse; also stored as the record key
     * @return the record filled in by the handler, plus key, text and type
     * @throws IOException if setting a field on the record fails
     */
    private static IndexDatum parse(String filename) throws IOException {
        // Every file needs its own IndexDatum: the set stores the reference,
        // so reusing one object would make all records share the same data.
        datum = new IndexDatum();
        parseXmlFile(filename, new EADHandler(), false);
        datum.setField("key", filename); // TODO this is temporary until we find a better key!
        datum.setField("text", "INSERT TEXT HERE"); // TODO change this to actual text
        datum.setField("type", "EAD");
        return datum;
    }

    /**
     * SAX handler that routes the text of selected EAD elements into fields
     * of the current record:
     *
     * -> "title":   <unittitle>, and <title> inside <controlaccess>/<origination>
     * -> "subject": inside <controlaccess>/<origination>: <subject>,
     *               <genreform>, <occupation>, <function>, and <title>,
     *               <persname>, <famname>, <corpname>, <geogname> when they
     *               carry role="subject"
     * -> "author":  <persname>, <famname>, <corpname> inside
     *               <controlaccess>/<origination> (without role="subject")
     * -> "date":    <unitdate>
     * -> "note":    the elements listed in NOTE_ELEMENTS
     *
     * Only the run of text that directly follows a matched start tag is
     * stored; text inside child elements of the matched element is not
     * collected, except that when no text precedes the first child the next
     * run of text found is taken instead.
     */
    static class EADHandler extends DefaultHandler {

        /** Element names whose text is stored in the "note" field. */
        private static final String[] NOTE_ELEMENTS = {
            "odd", "note", "accessrestrict", "legalstatus", "bibliography",
            "scopecontent", "prefercite", "altformavail", "originalsloc",
            "sponsor", "phystech", "userestrict", "acqinfo", "relatedmaterial",
            "separatedmaterial", "bioghist", "langmaterial", "otherfindaid",
            "custodhist", "appraisal", "processinfo", "accruals"
        };

        /**
         * Text collected since the last matched start tag. SAX may deliver a
         * single run of text in several characters() calls, so the pieces are
         * joined here and stored at the next element boundary.
         */
        private final StringBuilder pendingText = new StringBuilder();

        /** How many <controlaccess>/<origination> elements are currently open. */
        private int containerDepth = 0;

        /** Clears per-document state so a handler never inherits stale flags. */
        @Override
        public void startDocument() throws SAXException {
            pendingText.setLength(0);
            containerDepth = 0;
            grabCharacters = false;
            nestedTag = false;
        }

        /**
         * Decides which field, if any, the text following this start tag
         * belongs to.
         */
        @Override
        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {
            // Store what has been collected before fieldName can change.
            flushPendingText();

            if (isContainer(qName)) {
                containerDepth++;
                nestedTag = true;
                return;
            }

            // Inside a container the nested rules win; anything they do not
            // recognise is still checked against the general rules.
            String target = nestedTag ? nestedFieldFor(qName, attributes) : null;
            if (target == null) {
                target = generalFieldFor(qName);
            }
            if (target == null) {
                return; // no match: leave fieldName and grabCharacters alone
            }
            fieldName = target;
            grabCharacters = true;
        }

        /**
         * Stores any collected text and leaves nested mode once the outermost
         * container element closes.
         */
        @Override
        public void endElement(String uri, String localName, String qName)
                throws SAXException {
            flushPendingText();
            if (isContainer(qName)) {
                if (containerDepth > 0) {
                    containerDepth--;
                }
                nestedTag = containerDepth > 0;
            }
        }

        /** Buffers text while a matched element is waiting for its content. */
        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            if (grabCharacters) {
                pendingText.append(ch, start, length);
            }
        }

        /**
         * Appends the buffered text (followed by "\n\t") to the current field.
         * Does nothing while the buffer is empty, so a matched element with no
         * leading text keeps waiting for the next run of text.
         */
        private void flushPendingText() {
            if (!grabCharacters || pendingText.length() == 0) {
                return;
            }
            String text = pendingText.toString();
            pendingText.setLength(0);
            grabCharacters = false;
            try {
                datum.concatenateField(fieldName, text + "\n\t");
            } catch (IOException e) {
                System.err
                        .println("*** Saving parsed data to Index Datum failed! ***");
                e.printStackTrace();
                failFlag = true;
            }
        }

        /** @return true for the elements that switch the handler into nested mode */
        private static boolean isContainer(String qName) {
            return qName.equals("controlaccess") || qName.equals("origination");
        }

        /**
         * Field for an element found inside <controlaccess>/<origination>,
         * or null if the nested rules do not cover it.
         */
        private static String nestedFieldFor(String qName, Attributes attributes) {
            if (qName.equals("subject") || qName.equals("genreform")
                    || qName.equals("occupation") || qName.equals("function")) {
                return "subject";
            }
            boolean isName = qName.equals("persname") || qName.equals("famname")
                    || qName.equals("corpname");
            boolean isTitle = qName.equals("title");
            if (isName || isTitle || qName.equals("geogname")) {
                String role = (attributes == null) ? null : attributes.getValue("role");
                if (role != null && role.equalsIgnoreCase("subject")) {
                    return "subject";
                }
            }
            if (isName) {
                return "author";
            }
            if (isTitle) {
                return "title";
            }
            return null;
        }

        /** Field for an element matched regardless of nesting, or null. */
        private static String generalFieldFor(String qName) {
            if (qName.equals("unittitle")) {
                return "title";
            }
            if (qName.equals("unitdate")) {
                return "date";
            }
            for (String noteElement : NOTE_ELEMENTS) {
                if (noteElement.equals(qName)) {
                    return "note";
                }
            }
            return null;
        }
    }

    /**
     * Parses an XML file using a SAX parser. Failures are not propagated:
     * they are reported on System.err and recorded in the fail flag.
     *
     * @param filename path of the XML file
     * @param handler receives the SAX events
     * @param validating if true, the content is validated against the DTD
     *        specified in the file
     */
    public static void parseXmlFile(String filename, DefaultHandler handler,
            boolean validating) {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setValidating(validating);
            factory.newSAXParser().parse(new File(filename), handler);
        } catch (SAXException e) { // the xml input is not valid
            reportParseFailure("*** SAX Exception ***", e);
        } catch (ParserConfigurationException e) {
            reportParseFailure("*** Parser Configuration Exception ***", e);
        } catch (IOException e) {
            reportParseFailure("*** IO Exception ***", e);
        }
    }

    /** Prints the banner and the stack trace, and records the failure. */
    private static void reportParseFailure(String banner, Exception cause) {
        System.err.println(banner);
        cause.printStackTrace();
        failFlag = true;
    }
}
/**
 * Filename filter that keeps only *.xml entries which are not directories.
 */
class XmlFilter implements FilenameFilter {
    /**
     * Accepts an entry when it is not a directory and its name ends in
     * ".xml", compared case-insensitively.
     *
     * @param dir
     *            the directory in which the file was found.
     *
     * @param name
     *            the name of the file
     *
     * @return true if and only if the name should be included in the file
     *         list; false otherwise.
     */
    public boolean accept(File dir, String name) {
        File candidate = new File(dir, name);
        boolean hasXmlSuffix = name.toLowerCase().endsWith(".xml");
        return !candidate.isDirectory() && hasXmlSuffix;
    }
}
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment