// The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/**
 * <p>Title: SerializeTralex.java</p>
 * <p>Description: Convert a translation lexicon to serialized form</p>
 * <p>Copyright: Copyright (C) 2004 Ali Argyle</p>
 * <p>Company: Department of Computer Science, New York University</p>
 * @author Ali Argyle
 * @version 0.2
 *
 */

// To cut down the time that it takes to run a large number of files
// through GMA, a serialized version of the translation lexicon can
// be provided to GMA (optional).  This program takes in a tralex and
// creates the serialized version.

// Step 1. You will need to run from the main gma directory 
//         cd pathtogma/GMA/
// Step 2. Setup your classpath so gma utilities and tools dir
//         can be reached:
//         export CLASSPATH=lib/gma.jar:tools/
// Step 3. compile the code
//         javac tools/SerializeTralex.java
// Step 4. Run with a tralex(required) and stopfiles(optional)
//         java SerializeTralex -tralex rc/ME.tralex -xstop 
//             rc/malay.stoplist -ystop rc/english.stoplist
// Step 5. The new tralex file will be placed in the same dir
//         as the original with .serial extension.  You can now 
//         modify the config file to use the .serial file
// Note:  Do not change the extension; if the file does not end
//        in .serial, GMA will treat it as a regular file.
import java.io.*;
import java.util.Properties;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import gma.util.ByteInputFileHandler;
import gma.util.ByteParser;


/**
 * Command-line tool that reads a translation lexicon, optionally filters it
 * through stop word lists, and writes the result as two Java-serialized maps.
 *
 * <p>Typical use: construct an instance, call {@link #readFiles()}, then
 * {@link #writeFile()}.</p>
 */
public class SerializeTralex {
    //constants for command line arguments and property file
    public static final String DASH = "-";
    public static final String SPACE = " ";
    public static final String TRALEX = "tralex";
    public static final String XSTOP = "xstop";
    public static final String YSTOP = "ystop";
    public static final String SERIAL_FILE = "serialFile";

    public static final String TRANSLATION_LEXICON = "translationLexicon";

    /** Extension appended to the lexicon file name when no output file is given. */
    private static final String SERIAL_EXTENSION = ".serial";

    /** Maps the first word of each lexicon pair to the list of its second words (non-english keys). */
    public Map hTrans = new HashMap();
    /** Maps the second word of each lexicon pair to the list of its first words (english keys). */
    public Map vTrans = new HashMap();
    /** Pairs whose first word appears in this list are skipped by {@link #readFiles()}. */
    public List xStopWords = new ArrayList();
    /** Pairs whose second word appears in this list are skipped by {@link #readFiles()}. */
    public List yStopWords = new ArrayList();
    /** Settings keyed by the argument-name constants above. */
    public Properties properties = new Properties();
    /** Path of the lexicon file; set by {@link #readFiles()}. */
    public String translationLexiconFile = "";
    public boolean hasxstop = false;
    public boolean hasystop = false;

    /** Outcome of the most recent {@link #writeFile()} call; drives the exit status in main. */
    private boolean writeSucceeded = false;


    /**
     * Constructor. Parses the command line; on an invalid command line it
     * prints the usage text and the error to standard error and exits with
     * status 1.
     * @param args                  command line arguments
     */
    public SerializeTralex(String[] args) {
        try {
            parseArguments(args);
        } catch (IllegalArgumentException e) {
            printUsage();
            // Errors belong on the same stream as the usage text.
            System.err.println(e.getMessage());
            System.exit(1);
        }
    }

    /**
     * Constructor.
     * @param properties            properties
     */
    public SerializeTralex(Properties properties) {
        this.properties = properties;
    }


    /**
     * Parses command line arguments into {@link #properties}.
     * @param args                              command line arguments, as flag/value pairs
     * @throws IllegalArgumentException         if the argument count is odd, a flag is
     *                                          unknown, or no tralex file was given
     */
    private void parseArguments(String[] args) throws IllegalArgumentException {
        if ((args.length % 2) != 0) {
            throw new IllegalArgumentException("The number of arguments must be even.");
        }

        boolean gotTralex = false;
        // Arguments come in (flag, value) pairs; the even-length check above
        // guarantees that args[index + 1] exists.
        for (int index = 0; index < args.length; index += 2) {
            String flag = args[index];
            String value = args[index + 1];

            if (flag.equals(DASH + TRALEX)) {
                properties.put(TRALEX, value);
                gotTralex = true;
            } else if (flag.equals(DASH + XSTOP)) {
                properties.put(XSTOP, value);
            } else if (flag.equals(DASH + YSTOP)) {
                properties.put(YSTOP, value);
            } else if (flag.equals(DASH + SERIAL_FILE)) {
                properties.put(SERIAL_FILE, value);
            } else {
                throw new IllegalArgumentException(flag + " is an invalid argument.");
            }
        }

        if (!gotTralex) {
            throw new IllegalArgumentException("Tralex file must be specified at the command line.");
        }
    }

    /**
     * Forms command line argument usage.
     * @param argument              command line argument
     * @param isRequired            true for required argument
     * @param example               example command line argument
     * @return                      command line argument usage
     */
    private String formArgumentUsage(String argument, boolean isRequired, String example) {
        StringBuffer buffer = new StringBuffer();
        buffer.append("\t").append(DASH).append(argument).append(SPACE).append(argument).append("\n");
        buffer.append("\t").append(isRequired ? "required argument; " : "optional argument; ");
        buffer.append("e.g., ").append(DASH).append(argument).append(SPACE).append(example).append("\n\n");
        return buffer.toString();
    }

    /**
     * Prints command usage to standard error.
     */
    private void printUsage() {
        StringBuffer buffer = new StringBuffer("Usage: java SerializeTralex [arguments]\n\n");
        buffer.append("where [arguments] are:\n\n");
        buffer.append(formArgumentUsage(TRALEX, true, "./tralex.O.E"));
        buffer.append(formArgumentUsage(XSTOP, false, "./french.stoplist"));
        buffer.append(formArgumentUsage(YSTOP, false, "./english.stoplist"));
        buffer.append(formArgumentUsage(SERIAL_FILE, false, "./tralex.O.E.serial"));
        System.err.println(buffer.toString());
    }

    /**
     * Loads stop word list.
     * @param properties                  properties
     * @param propertyName                property name for stop word file
     * @return                            the word list read from the file named by that property
     */
    private List loadStopWordList(Properties properties, String propertyName) {
        String stopWordFile = properties.getProperty(propertyName);
        ByteInputFileHandler input = new ByteInputFileHandler(stopWordFile);
        return input.readWordList();
    }

    /**
     * Appends {@code translation} to the list stored under {@code key},
     * creating the list on first use and skipping duplicates.
     * @param lexicon                     map from a word to the list of its translations
     * @param key                         word to file the translation under
     * @param translation                 translation to record
     */
    private static void addTranslation(Map lexicon, Object key, Object translation) {
        List translations = (List) lexicon.get(key);
        if (translations == null) {
            translations = new ArrayList();
            lexicon.put(key, translations);
        }
        if (!translations.contains(translation)) {
            translations.add(translation);
        }
    }

    /**
     * Reads the optional stop word lists and the translation lexicon named in
     * {@link #properties}, filling {@link #hTrans} and {@link #vTrans}.
     *
     * <p>A lexicon line is skipped when its first word is in
     * {@link #xStopWords} or its second word is in {@link #yStopWords}. If a
     * line does not parse into exactly two elements, an error is printed and
     * the JVM exits with status 1.</p>
     */
    public void readFiles() {
        System.err.println("Reading in the lexicon...(may take a while)");
        if (properties.getProperty(XSTOP) != null) {
            xStopWords = loadStopWordList(properties, XSTOP);
        }
        if (properties.getProperty(YSTOP) != null) {
            yStopWords = loadStopWordList(properties, YSTOP);
        }

        translationLexiconFile = properties.getProperty(TRALEX);
        // NOTE(review): the handler is only closed explicitly on the error
        // path below; confirm whether ByteInputFileHandler releases the file
        // on its own once hasLine() returns false.
        ByteInputFileHandler input = new ByteInputFileHandler(translationLexiconFile);

        // go through each line in the dictionary
        while (input.hasLine()) {
            List dictLine = input.nextLine();
            List pairList = new ByteParser(dictLine).parseDictionaryLine();

            if (pairList.size() != 2) {
                System.err.println("The input file is not in the correct translation lexicon format.");
                input.close();
                System.exit(1);
            }

            List from = (List) pairList.get(0);
            List to = (List) pairList.get(1);
            if (xStopWords.contains(from) || yStopWords.contains(to)) {
                continue;
            }

            // record the pair in both directions
            addTranslation(hTrans, from, to);
            addTranslation(vTrans, to, from);
        }
    }

    /**
     * Closes a stream, ignoring any failure; used only for cleanup after the
     * data has already been flushed or the write has already failed.
     * @param stream                      stream to close, may be null
     */
    private static void closeQuietly(OutputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close during cleanup.
        }
    }

    /**
     * Serializes {@link #hTrans} followed by {@link #vTrans} to disk.
     *
     * <p>The output goes to the file named by the {@code serialFile} property
     * when it is set, otherwise to the lexicon file name with
     * {@code .serial} appended. Failures are reported on standard error; this
     * method does not throw for I/O problems.</p>
     */
    public void writeFile() {
        writeSucceeded = false;

        System.err.println("Now putting the lexicon into serial form (YUM ...  cereal!)");
        String requestedTarget = (properties != null) ? properties.getProperty(SERIAL_FILE) : null;
        File serializedFile = new File(
            requestedTarget != null ? requestedTarget : translationLexiconFile + SERIAL_EXTENSION);
        System.err.println("Writting to file : " + serializedFile);

        FileOutputStream outStream = null;
        ObjectOutputStream objStream = null;
        try {
            // setup a stream to a physical file on the filesystem
            outStream = new FileOutputStream(serializedFile);

            // attach a stream capable of writing objects to the stream that is
            // connected to the file
            objStream = new ObjectOutputStream(outStream);

            // write out the two lexicon data structures
            objStream.writeObject(hTrans);
            objStream.writeObject(vTrans);

            // Push everything to the file before declaring success; the
            // streams themselves are closed in the finally block.
            objStream.flush();
            writeSucceeded = true;
        } catch (IOException e) {
            System.err.println("Could not write serialized lexicon to " + serializedFile);
            e.printStackTrace();
        } finally {
            // Close on every path so a failed write does not leak the file handle.
            closeQuietly(objStream);
            closeQuietly(outStream);
        }

        if (writeSucceeded) {
            System.err.println("Done with serialize operation");
        }
    }

    /**
     * Main method. Exits with status 0 when the serialized lexicon was
     * written, and 1 otherwise.
     * @param args                        command line arguments
     */
    public static void main(String[] args) {
        SerializeTralex serialize = new SerializeTralex(args);
        serialize.readFiles();
        serialize.writeFile();
        System.exit(serialize.writeSucceeded ? 0 : 1);
    }
}