Wednesday, July 21, 2010

Project 2 - Parsing MARC Data

This is largely the same as in Project 1, though cleaned up a bit.

/**
 *
 */
package crrasolrindexer;

import java.io.*;
import java.util.*;

import org.apache.solr.client.solrj.SolrServerException;
import org.marc4j.marc.*;
import org.marc4j.*;

/**
 * @author slittle2
 *
 *         MarcDataRetriever retrieves data from a MARC file and stores it in a
 *         LinkedHashSet of IndexDatum objects.
 *
 *             For ease of use, one could modify the code to use CRRA_Datum instead.
 *
 */
public class MarcDataRetriever {

    /**
     * @param args
     */

    // Default file name for retrieving MARC record. Used for test purposes.
    private static String marcFile = "C:/Documents and Settings/slittle2/Desktop/Index Data/crra.marc";

    // main() routine for testing.
    public static void main(String[] args) throws IOException, SolrServerException {

        LinkedHashSet<IndexDatum> recordSet = new LinkedHashSet<IndexDatum>();
        IndexDatum singleRecord = new IndexDatum();

        recordSet = getMarcData(marcFile);
        Iterator<IndexDatum> iter = recordSet.iterator();
        while (iter.hasNext()) {
            singleRecord = (IndexDatum) iter.next();

            System.out.println(singleRecord.returnField("title"));
            System.out.println(singleRecord.returnField("date"));
            System.out.println(singleRecord.returnField("author"));
            System.out.println(singleRecord.returnField("key"));
            System.out.println(singleRecord.returnField("subject"));
            System.out.println(singleRecord.returnField("note"));
            System.out.println(singleRecord.returnField("type"));
            System.out.println(singleRecord.returnField("text"));

        }
        
        Indexer.indexID(recordSet, "http://localhost:8983/solr/core0/");
        
        System.out.println("Successfully indexed... we hope.");
        
    }

    // Returns a single IndexDatum object with data extracted from MARC record
    private static IndexDatum extractMarcRecord(Record record) {
        IndexDatum datum = new IndexDatum();
        String input = null;

        try {

            // Extract title
            input = getMarcData(record, "245", 'a');
            input += getMarcData(record, "245", 'b');
            input += getMarcData(record, "245", 'c');
            input += getMarcData(record, "245", 'n');
            input += getMarcData(record, "245", 'p');
            datum.setField("title", input);

            // Extract date
            input = getMarcData(record, "260", 'c');
            datum.setField("date", input);

            // Extract notes
            input = "";
            for (int i = 500; i < 600; i++) {
                String str = (new Integer(i)).toString();
                input += getMarcData(record, str, 'a');
            }
            datum.setField("note", input);

            // Extract key
            input = getMarcData(record, "001");
            datum.setField("key", input);

            // Extract author
            input = getMarcData(record, "100", 'a');
            input += getMarcData(record, "100", 'b');
            input += getMarcData(record, "100", 'c');
            input += getMarcData(record, "110", 'a');
            input += getMarcData(record, "111", 'a');
            datum.setField("author", input);

            // Extract subjects
            input = "";
            for (int i = 600; i < 700; i++) {
                String str = (new Integer(i)).toString();
                input += getMarcData(record, str, 'a');
            }
            datum.setField("subject", input);

            // Set remaining fields: text, type
            datum.setField("text", "");
            datum.setField("type", "MARC");

        } catch (IOException e) {
            System.err.println("*** IO Exception while setting IndexDatum ***");
            e.getStackTrace();
        }

        return datum;
    }

    // To retrieve data from a DataField, which does have subfields
    private static String getMarcData(Record record, String fieldIndex,
            char subfieldIndex) {
        String newStringDatum = "";

        DataField field = (DataField) record.getVariableField(fieldIndex);
        Subfield subfield;

        try {
            subfield = field.getSubfield(subfieldIndex);
            newStringDatum = subfield.getData();

        } catch (NullPointerException npe) {
            newStringDatum = " ";
        }

        return newStringDatum;
    }

    // To retrieve data from a ControlField, which has no subfields
    private static String getMarcData(Record record, String fieldIndex) {
        String newStringDatum = "";

        ControlField field = (ControlField) record.getVariableField(fieldIndex);

        try {
            newStringDatum = field.getData();

        } catch (NullPointerException npe) {
            newStringDatum = " ";
        }

        return newStringDatum;
    }

    // Opens a file of MARC records, creates an iterator over it,
    // reads in the appropriate data, and saves it to an IndexDatum
    // which is then added to the LinkedHashSet
    public static LinkedHashSet<IndexDatum> getMarcData(String fileName)
            throws IOException {

        // LinkedHashSet for storing IndexDatum object
        LinkedHashSet<IndexDatum> MARCRecords = new LinkedHashSet<IndexDatum>();
        InputStream in = null;

        // Open the file & create the iterator
        try {
            in = new FileInputStream(fileName);
            MarcReader reader = new MarcStreamReader(in);

            // As long as there are unread records, read the data
            while (reader.hasNext()) {
                Record record = reader.next();
                IndexDatum datum = null;

                // Save the data to an IndexDatum
                datum = extractMarcRecord(record);

                // Add the IndexDatum to the Set
                MARCRecords.add(datum);
            } // end while

        } catch (FileNotFoundException e) {
            System.err.println("*** File Not Found ***");
        } finally {

            // Close input/output streams
            if (in != null)
                in.close();

        }
        // Return the LinkedHashSet
        return MARCRecords;

    }

}
 

No comments:

Post a Comment