A Little Goes a Long Way: Project 2

The code below is largely the same as in Project 1, though cleaned up a bit.

/**
*
*/
package crrasolrindexer;

import java.io.*;
import java.util.*;

import org.apache.solr.client.solrj.SolrServerException;
import org.marc4j.marc.*;
import org.marc4j.*;

/**
 * Retrieves data from a file of MARC records and stores it in a
 * {@link LinkedHashSet} of {@code IndexDatum} objects, one per record.
 *
 * <p>For ease of use, one could modify the code to use CRRA_Datum instead.
 *
 * @author slittle2
 */
public class MarcDataRetriever {

    // Default file name for retrieving MARC records. Used for test purposes
    // when main() is run without arguments.
    private static final String DEFAULT_MARC_FILE = "C:/Documents and Settings/slittle2/Desktop/Index Data/crra.marc";

    // Placeholder returned when a requested field, subfield or its data is
    // absent. A single space keeps concatenated values separated.
    private static final String MISSING = " ";

    // Fields echoed to standard output by main(), in print order.
    private static final String[] PRINTED_FIELDS = {
        "title", "date", "author", "key", "subject", "note", "type", "text"
    };

    /**
     * Test driver: reads a MARC file, prints the extracted fields of every
     * record, then hands the whole set to {@code Indexer.indexID}.
     *
     * @param args optional; {@code args[0]}, when present, is the path of the
     *             MARC file to read. Without it the built-in default path is
     *             used.
     * @throws IOException         if reading or closing the MARC file fails
     * @throws SolrServerException if declared failures occur while indexing
     */
    public static void main(String[] args) throws IOException, SolrServerException {

        String fileName = (args != null && args.length > 0) ? args[0] : DEFAULT_MARC_FILE;

        LinkedHashSet<IndexDatum> recordSet = getMarcData(fileName);

        for (IndexDatum singleRecord : recordSet) {
            for (String fieldName : PRINTED_FIELDS) {
                System.out.println(singleRecord.returnField(fieldName));
            }
        }

        Indexer.indexID(recordSet, "http://localhost:8983/solr/core0/");

        System.out.println("Successfully indexed... we hope.");
    }

    /**
     * Builds a single IndexDatum from one MARC record.
     *
     * <p>Fields are populated as follows:
     * <ul>
     * <li>title   - 245 subfields a, b, c, n, p</li>
     * <li>date    - 260 subfield c</li>
     * <li>note    - subfield a of every tag 500..599</li>
     * <li>key     - control field 001</li>
     * <li>author  - 100 subfields a, b, c, then 110 a and 111 a</li>
     * <li>subject - subfield a of every tag 600..699</li>
     * <li>text    - empty string</li>
     * <li>type    - the literal "MARC"</li>
     * </ul>
     * Every absent field or subfield contributes a single space, so the
     * note and subject values always contain one piece per tag in the range.
     * Only one field is looked up per tag; repeated fields with the same tag
     * are not merged here.
     *
     * @param record the MARC record to read from
     * @return the populated datum; if setting a field fails with an
     *         IOException the error is reported on stderr and the datum is
     *         returned with whatever fields were set before the failure
     */
    private static IndexDatum extractMarcRecord(Record record) {
        IndexDatum datum = new IndexDatum();

        try {
            // Extract title
            datum.setField("title", concatSubfields(record, "245", 'a', 'b', 'c', 'n', 'p'));

            // Extract date
            datum.setField("date", getMarcData(record, "260", 'c'));

            // Extract notes
            datum.setField("note", concatTagRange(record, 500, 600, 'a'));

            // Extract key
            datum.setField("key", getMarcData(record, "001"));

            // Extract author: personal name (100), then corporate (110) and
            // meeting (111) names.
            String author = concatSubfields(record, "100", 'a', 'b', 'c')
                    + getMarcData(record, "110", 'a')
                    + getMarcData(record, "111", 'a');
            datum.setField("author", author);

            // Extract subjects
            datum.setField("subject", concatTagRange(record, 600, 700, 'a'));

            // Set remaining fields: text, type
            datum.setField("text", "");
            datum.setField("type", "MARC");

        } catch (IOException e) {
            System.err.println("*** IO Exception while setting IndexDatum ***");
            // Actually report the trace; getStackTrace() alone only returns
            // the frames and prints nothing.
            e.printStackTrace();
        }

        return datum;
    }

    // Concatenates, in the given order, the listed subfields of one data field.
    private static String concatSubfields(Record record, String tag, char... codes) {
        StringBuilder joined = new StringBuilder();
        for (char code : codes) {
            joined.append(getMarcData(record, tag, code));
        }
        return joined.toString();
    }

    // Concatenates one subfield across the tags firstTag (inclusive) to
    // endTag (exclusive), in ascending tag order.
    private static String concatTagRange(Record record, int firstTag, int endTag, char code) {
        StringBuilder joined = new StringBuilder();
        for (int tag = firstTag; tag < endTag; tag++) {
            joined.append(getMarcData(record, Integer.toString(tag), code));
        }
        return joined.toString();
    }

    /**
     * Retrieves data from a DataField, which does have subfields.
     *
     * @param record        the MARC record to read from
     * @param fieldIndex    tag of the data field, e.g. "245"
     * @param subfieldIndex code of the subfield, e.g. 'a'
     * @return the subfield's data, or a single space when the record has no
     *         data field with that tag, the field has no such subfield, or
     *         the subfield carries no data
     */
    private static String getMarcData(Record record, String fieldIndex,
            char subfieldIndex) {
        // instanceof is false for null, so this covers both "no such field"
        // and "field is not a data field".
        Object candidate = record.getVariableField(fieldIndex);
        if (!(candidate instanceof DataField)) {
            return MISSING;
        }

        Subfield subfield = ((DataField) candidate).getSubfield(subfieldIndex);
        if (subfield == null) {
            return MISSING;
        }

        String data = subfield.getData();
        return (data != null) ? data : MISSING;
    }

    /**
     * Retrieves data from a ControlField, which has no subfields.
     *
     * @param record     the MARC record to read from
     * @param fieldIndex tag of the control field, e.g. "001"
     * @return the field's data, or a single space when the record has no
     *         control field with that tag or the field carries no data
     */
    private static String getMarcData(Record record, String fieldIndex) {
        Object candidate = record.getVariableField(fieldIndex);
        if (!(candidate instanceof ControlField)) {
            return MISSING;
        }

        String data = ((ControlField) candidate).getData();
        return (data != null) ? data : MISSING;
    }

    /**
     * Opens a file of MARC records, iterates over it, extracts the
     * appropriate data from each record into an IndexDatum, and collects the
     * results in a LinkedHashSet.
     *
     * @param fileName path of the MARC file to read
     * @return the extracted records; empty when the file cannot be found
     *         (a message is written to stderr in that case)
     * @throws IOException if closing the input stream fails
     */
    public static LinkedHashSet<IndexDatum> getMarcData(String fileName)
            throws IOException {

        // LinkedHashSet for storing IndexDatum objects
        LinkedHashSet<IndexDatum> marcRecords = new LinkedHashSet<IndexDatum>();
        InputStream in = null;

        // Open the file & create the iterator
        try {
            in = new FileInputStream(fileName);
            MarcReader reader = new MarcStreamReader(in);

            // As long as there are unread records, read the data
            while (reader.hasNext()) {
                // Save the data to an IndexDatum and add it to the Set
                marcRecords.add(extractMarcRecord(reader.next()));
            }

        } catch (FileNotFoundException e) {
            // Best effort: report and fall through to return the empty set.
            System.err.println("*** File Not Found ***");
        } finally {
            // Close the input stream if it was opened
            if (in != null) {
                in.close();
            }
        }

        return marcRecords;
    }

}

A Little Goes a Long Way

Wednesday, July 21, 2010

Project 2 - Parsing MARC Data

No comments:

Post a Comment

Evolution

Followers