package gma.gsa;
/**
* <p>Title: </p>
* <p>Description: GSA implements GSA algorithm to line up bitext, based on the bitext correspondence generated by SIMR algorithm.</p>
* <p>Copyright: Copyright (C) 2004 I. Dan Melamed</p>
* <p>Company: Department of Computer Science, New York University</p>
* @author Luke Shen
*/
import gma.AxisTick;
import gma.BitextSpace;
import gma.MapPoint;
import gma.gcalign.GCalign;
import gma.simr.SIMR;
import gma.util.InputFileHandler;
import gma.util.OutputFileHandler;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;
public class GSA {
// constants in property file
public static final String GSA = "gsa";
public static final String BACK_OFF = "backOff";
public Properties properties = new Properties(); //properties
Map xToYSegmentMap = new HashMap();
Map yToXSegmentMap = new HashMap();
Map leftHash = new HashMap();
Map rightHash = new HashMap();
/**
* Constructor.
* @param args command line arguments
*/
public GSA(String[] args) {
try {
parseArguments(args);
} catch (IllegalArgumentException e) {
printUsage();
System.exit(1);
}
}
/**
* Constructor.
* @param properties properties
*/
public GSA(Properties properties) {
this.properties = properties;
}
/**
* Forms command line argument usage.
* @param argument command line argument
* @param isRequired true for required argument
* @param example example command line argument
* @return command line argument usage
*/
private String formArgumentUsage(String argument, boolean isRequired, String example) {
StringBuffer buffer = new StringBuffer();
buffer.append("\t").append(SIMR.DASH).append(argument).append(SIMR.SPACE).append(argument).append("\n");
if (isRequired) {
buffer.append("\t").append("required argument; ");
} else {
buffer.append("\t").append("optional argument; ");
}
buffer.append("e.g., ").append(SIMR.DASH).append(argument).append(SIMR.SPACE).append(example).append("\n\n");
return buffer.toString();
}
/**
* Prints command usage.
*/
private void printUsage() {
StringBuffer buffer = new StringBuffer("Usage: java gma.gsa.GSA [arguments]\n\n");
buffer.append("where [arguments] are:\n\n");
buffer.append(formArgumentUsage(SIMR.PROPERTIES, true, "./GSA.properties"));
buffer.append(formArgumentUsage(SIMR.X_AXIS_FILE, false, "./french.txt"));
buffer.append(formArgumentUsage(SIMR.Y_AXIS_FILE, false, "./english.txt"));
buffer.append(formArgumentUsage(SIMR.SIMR + "." + SIMR.OUTPUT_FILE, false, "./simrOutput.txt"));
buffer.append(formArgumentUsage(GSA + "." + SIMR.OUTPUT_FILE, false, "./gsaOutput.txt"));
System.err.println(buffer.toString());
}
/**
* Parses command line arguments.
* @param args command line arguments
* @throws IllegalArgumentException
*/
private void parseArguments(String[] args) throws IllegalArgumentException {
InputStream in = null;
if ((args.length % 2) != 0) {
throw new IllegalArgumentException("The number of arguments must be even.");
}
for (int index = 0; index < args.length; index++) {
if (args[index].equals(SIMR.DASH + SIMR.PROPERTIES)) {
try {
in = new BufferedInputStream(new FileInputStream(args[++index]));
properties.load(in);
} catch (FileNotFoundException e) {
e.printStackTrace();
System.exit(1);
} catch (IOException e) {
e.printStackTrace();
System.exit(1);
} finally {
if (in != null) {
try {
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
} else if (args[index].equals(SIMR.DASH + SIMR.X_AXIS_FILE)) {
properties.put(SIMR.X_AXIS_FILE, args[++index]);
} else if (args[index].equals(SIMR.DASH + SIMR.Y_AXIS_FILE)) {
properties.put(SIMR.Y_AXIS_FILE, args[++index]);
} else if (args[index].equals(SIMR.DASH + SIMR.SIMR + "." + SIMR.OUTPUT_FILE)) {
properties.put(SIMR.SIMR + "." + SIMR.OUTPUT_FILE, args[++index]);
} else if (args[index].equals(SIMR.DASH + GSA + "." + SIMR.OUTPUT_FILE)) {
properties.put(GSA + "." + SIMR.OUTPUT_FILE, args[++index]);
} else {
throw new IllegalArgumentException(args[index] + "is an invalid argument.");
}
}
if (in == null) {
throw new IllegalArgumentException("Property file must be specified at the command line.");
}
}
/**
* Template method to generate bitext alignments.
* @return list of aligned blocks
*/
public List generateAlignedBlocks() {
List xAxisSegments = generateSegments(SIMR.X_AXIS_FILE);
List yAxisSegments = generateSegments(SIMR.Y_AXIS_FILE);
List mapPoints = generateMapPoints();
List alignedBlocks = generateAlignedBlocks(xAxisSegments, yAxisSegments, mapPoints);
backoffAlignment(alignedBlocks);
gcAlign(alignedBlocks);
return alignedBlocks;
}
/**
* Generates segments from the input file.
* @param axisFileProperty property name to look up the input file name
* @return list of segments
*/
private List generateSegments(String axisFileProperty) {
List segments = new ArrayList();
int counter = -1;
float startPosition = 0f;
float endPosition = 0f;
float length = 0f;
float prevEndPosition = 0f;
boolean blank = true;
InputFileHandler input = new InputFileHandler(properties.getProperty(axisFileProperty));
while (input.hasLine()) {
String line = input.nextLine();
StringTokenizer tokenizer = new StringTokenizer(line);
// there is not harm in using the StringTokenizer here
// since we are just looking for <EOF> marks
for (int i = 0; i < 1; i++) {
float tempEndPosition = Float.parseFloat(tokenizer.nextToken());
// is the an EOS line
if ( tokenizer.nextToken().equals(properties.getProperty(BitextSpace.EOS_MARKER)) ) {
if (blank == true) {
length = 0;
} else {
length = tempEndPosition - prevEndPosition;
};
endPosition = tempEndPosition;
counter++;
prevEndPosition = tempEndPosition;
blank = true;
Segment segment = new Segment(counter, startPosition, endPosition, length);
segments.add(segment);
} else {
blank = false;
}
} //end of for loop
startPosition = endPosition;
} //end of while loop
input.close();
return segments;
}
/**
* Generates map points based on the bitext correspondence produced by SIMR algorithm.
* @return list of map points
*/
private List generateMapPoints() {
List mapPoints = new ArrayList();
InputFileHandler input = new InputFileHandler(properties.getProperty(SIMR.SIMR + "." + SIMR.OUTPUT_FILE));
while (input.hasLine()) {
String line = input.nextLine();
StringTokenizer st = new StringTokenizer(line);
AxisTick xAxisTick = new AxisTick(-1, Float.parseFloat(st.nextToken()), null); //index and word for axis tick are negligible in gsa
AxisTick yAxisTick = new AxisTick(-1, Float.parseFloat(st.nextToken()), null);
MapPoint mapPoint = new MapPoint(xAxisTick, yAxisTick);
mapPoints.add(mapPoint);
}
return mapPoints;
}
private void hcouple(Segment hSegment) {
leftHash.put(hSegment,"true");
List ySegments = null;
ySegments = (List)xToYSegmentMap.get(hSegment);
// now for each ySegment in ySegments
// does the rightHash contain the index of the ySegment
// if not then call vcouple on the vind
for (int vind = 0; vind < ySegments.size(); vind++) {
Segment vSegment = (Segment)ySegments.get(vind);
if (!rightHash.containsKey(vSegment)) {
vcouple(vSegment);
}
}
//return;
}
private void vcouple(Segment vSegment) {
rightHash.put(vSegment,"true");
List xSegments = null;
xSegments = (List)yToXSegmentMap.get(vSegment);
// now for each ySegment in ySegments
// does the rightHash contain the index of the ySegment
// if not then call vcouple on the vind
for (int hind = 0; hind < xSegments.size(); hind++) {
Segment hSegment = (Segment)xSegments.get(hind);
if (!leftHash.containsKey(hSegment)) {
hcouple(hSegment);
}
}
//return;
}
/**
* Generates aligned blocks based on bitext correspondence.
* @param xAxisSegments list of segments on the x axis
* @param yAxisSegments list of segments on the y axis
* @param mapPoints list of map points of bitext correspondence
* @return list of aligned blocks
*/
private List generateAlignedBlocks(List xAxisSegments, List yAxisSegments,
List mapPoints) {
//the first step is to generate the hash map that records aligned segments
int xSegmentIndex = 0;
int ySegmentIndex = 0;
Iterator mapPointIter = mapPoints.iterator();
Segment xSegment = null;
Segment ySegment = (Segment)yAxisSegments.get(ySegmentIndex);
MapPoint mapPoint = null;
boolean needNewXSegment = true;
//***********************************************************************
// Generate the mapping of points --> segments
//
// yToXSegmentMap
// xToYSegmentMap
//***********************************************************************
while (( xSegmentIndex < xAxisSegments.size() -1) || ( ySegmentIndex < yAxisSegments.size() )){
// assumes that map points are sorted according to its x axis position
// so that xSegmentIndex can be monotonically incremented
if (needNewXSegment) {
xSegment = (Segment)xAxisSegments.get(xSegmentIndex);
xSegmentIndex++;
needNewXSegment = false;
}
// grab a new map point if there is one
if (mapPoint == null) {
if (mapPointIter.hasNext()) {
mapPoint = (MapPoint)mapPointIter.next();
} else {
break;
}
}
if (xSegment.contains(mapPoint, true) == 0) {
// find y segment by looking back
while (ySegment.contains(mapPoint, false) == -1) {
ySegmentIndex--;
ySegment = (Segment)yAxisSegments.get(ySegmentIndex);
}
// find y segment by look forward
while (ySegment.contains(mapPoint, false) == 1) {
ySegmentIndex++;
ySegment = (Segment)yAxisSegments.get(ySegmentIndex);
}
// x segment to y segment mapping
List ySegments = null;
if (xToYSegmentMap.containsKey(xSegment)) {
ySegments = (List)xToYSegmentMap.get(xSegment);
if (ySegments.contains(ySegment)) {
mapPoint = null;
continue;
}
// insert ySegment if it is not the last segment in the ySegments list
for (int listIndex = 0; listIndex < ySegments.size(); listIndex++) {
if (ySegment.compareTo(ySegments.get(listIndex)) == -1) {
ySegments.add(listIndex, ySegment);
break;
}
}
// append ySegment if it is the last segment in the ySegments list
if (ySegment.compareTo(ySegments.get(ySegments.size() - 1)) == 1) {
ySegments.add(ySegment);
}
} else { // ie. xToYSegmentMap.containsKey(xSegment) == false
ySegments = new LinkedList();
ySegments.add(ySegment);
}
if (xSegment.getIndex() > 0) {
xToYSegmentMap.put(xSegment, ySegments);
}
// y segment to x segment mapping
List xSegments = null;
if (yToXSegmentMap.containsKey(ySegment)) {
xSegments = (List)yToXSegmentMap.get(ySegment);
//we are here only if the ySegment is not contained in the xToYSegmentMap,
//which means that the xSegment is not contained in the yToxSegmentMap.
//so there is no need to check whether xSegment is contained in the xSegments list
// insert xSegment if it is not the last segment in the xSegments list
for (int listIndex = 0; listIndex < xSegments.size(); listIndex++) {
if (xSegment.compareTo(xSegments.get(listIndex)) == -1) {
xSegments.add(listIndex, xSegment);
break;
}
}
// append xSegment if it is the last segment in the xSegments list
if (xSegment.compareTo(xSegments.get(xSegments.size() - 1)) == 1) {
xSegments.add(xSegment);
}
} else { // ie. yToXSegmentMap.containsKey(ySegment) == false
xSegments = new LinkedList();
xSegments.add(xSegment);
}
if (ySegment.getIndex() > 0) {
yToXSegmentMap.put(ySegment, xSegments);
}
//prepare to read the next map point
mapPoint = null;
} else { // ie. xSegment.contains(mapPoint, true) == 1; because mapPoints from SIMR is sorted by x axis position, the case of xSegment.contains(mapPoint, true) == -1 does not exist
needNewXSegment = true;
}
} //end of the for loop
// todo?: Undefine the segments that have no length
/*****************************************************/
// BLOCKS
// the second step is to generate a list of continuous
// and non-overlapping aligned blocks from the two hash maps
/*****************************************************/
List alignedBlocks = new ArrayList();
xSegmentIndex = 1;
ySegmentIndex = 1;
xSegment = (Segment)xAxisSegments.get(xSegmentIndex);
ySegment = (Segment)yAxisSegments.get(ySegmentIndex);
int maxhseg = xAxisSegments.size() -1;
int maxvseg = yAxisSegments.size() -1;
AlignedBlock alignedBlock = new AlignedBlock();
int leftind = 1;
int rightind = 1;
while ( (xSegmentIndex <= maxhseg) || (ySegmentIndex <= maxvseg) ) {
List thisleft = new ArrayList();
List thisright = new ArrayList();
// first check for omitted but (non zero length) segments
while ( !xToYSegmentMap.containsKey(xSegment) && xSegmentIndex < maxhseg ) {
if (xSegment.getLength() > 0) {
alignedBlock.addSegment(xSegment, true);
}
xSegmentIndex++;
xSegment = (Segment)xAxisSegments.get(xSegmentIndex);
}
while ( !yToXSegmentMap.containsKey(ySegment) && ySegmentIndex < maxvseg) {
if (ySegment.getLength() > 0) {
alignedBlock.addSegment(ySegment, false);
}
ySegmentIndex++;
ySegment = (Segment)yAxisSegments.get(ySegmentIndex);
}
// place this new aligned block of omitted segments into
// the main list of AlignedBlocks
if ((alignedBlock.hasSegments(true)) || (alignedBlock.hasSegments(false))) {
List xSegBlock = alignedBlock.getSegments(true, true);
List ySegBlock = alignedBlock.getSegments(false, true);
alignedBlocks.add(alignedBlock.copy());
alignedBlock.clear();
continue;
}
if ((xSegmentIndex <= maxhseg) || (ySegmentIndex <= maxvseg)) {
// perform transitive closure to get the next block
// note: omitted segments above, if any, never need closure
//do we need the following statement?
//if (xToYSegmentMap.containsKey(xSegment)) {
leftHash.clear();
rightHash.clear();
//System.err.println("Perform transitive closure");
// if the alignment is recorded in the hashmap,
// generate alignment block based on the hash map info
if (xSegmentIndex <= maxhseg) {
// hcouple(xSegmentIndex)
Segment hSegment = (Segment)xAxisSegments.get(xSegmentIndex);
hcouple(hSegment);
}
if (ySegmentIndex <= maxvseg) {
// vcouple(ySegmentIndex)
Segment vSegment = (Segment)yAxisSegments.get(ySegmentIndex);
vcouple(vSegment);
}
// by adding the segments i ensure they are in sorted order
Iterator leftHashIter = leftHash.keySet().iterator();
while (leftHashIter.hasNext()) {
alignedBlock.addSegment((Segment)leftHashIter.next(), true);
}
Iterator rightHashIter = rightHash.keySet().iterator();
while (rightHashIter.hasNext()) {
alignedBlock.addSegment((Segment)rightHashIter.next(), false);
}
// add any segments necessary to make this block continuous
List thisXBlock = new LinkedList(alignedBlock.getSegments(true, false));
if (thisXBlock.size() > 0) {
int start = ((Segment)thisXBlock.get(0)).getIndex();
int finish = ((Segment)thisXBlock.get(thisXBlock.size()-1)).getIndex();
for (int i = (start); i <= finish; i++) {
// add the segment with index i to the current alignedBlock
Segment tempXSegment = (Segment)xAxisSegments.get(i);
if (tempXSegment.getLength() > 0) {
alignedBlock.addSegment(tempXSegment, true);
}
}
}
List thisYBlock = new LinkedList(alignedBlock.getSegments(false, false));
if (thisYBlock.size() > 0) {
int start = ((Segment)thisYBlock.get(0)).getIndex();
int finish = ((Segment)thisYBlock.get(thisYBlock.size()-1)).getIndex();
for (int i = (start); i <= finish; i++) {
// add the segment with index i to the current alignedBlock
Segment tempYSegment = (Segment)yAxisSegments.get(i);
if (tempYSegment.getLength() > 0) {
alignedBlock.addSegment(tempYSegment, false);
}
}
}
/**************add the block ******************************/
List xSegBlock = alignedBlock.getSegments(true, true);
List ySegBlock = alignedBlock.getSegments(false, true);
alignedBlocks.add(alignedBlock.copy());
alignedBlock.clear();
int finishx = ((Segment)thisXBlock.get(thisXBlock.size()-1)).getIndex();
int finishy = ((Segment)thisYBlock.get(thisYBlock.size()-1)).getIndex();
xSegmentIndex = finishx + 1;
if (xSegmentIndex <= maxhseg) {
xSegment = (Segment)xAxisSegments.get(xSegmentIndex);
}
ySegmentIndex = finishy + 1;
if (ySegmentIndex <= maxvseg) {
ySegment = (Segment)yAxisSegments.get(ySegmentIndex);
}
}
}
return alignedBlocks;
}
/**
* Backs off bitext alignment to generate more even alignment.
* @param alignedBlocks list of aligned blocks
*/
public void backoffAlignment(List alignedBlocks) {
for ( int index = 0; index < alignedBlocks.size(); index++) {
AlignedBlock currentAlignedBlock = (AlignedBlock)alignedBlocks.get(index);
if (currentAlignedBlock.getBalanceStatus() == 0) {
// if the alignment is even, do nothing
continue;
} else {
// if the alignment is not even, try to make it even by merging with
// its previous aligned block and/or next aligned block
boolean mergePrevious = false;
boolean mergeNext = false;
if (index > 0) {
// test whether the current aligned block can merge with its previous aligned block
AlignedBlock previousAlignedBlock = (AlignedBlock)alignedBlocks.get(index - 1);
if (currentAlignedBlock.getBalanceStatus() == -1) {
if (previousAlignedBlock.hasSegments(true)) {
double balanceScore = currentAlignedBlock.computeBalance(previousAlignedBlock, false, false); //true for down balance and false for y axis balance
if (balanceScore > Double.parseDouble(properties.getProperty(BACK_OFF))) {
mergePrevious = true;
}
}
} else { // ie. currentAlignedBlock.getBalanceStatus() == 1
if (previousAlignedBlock.hasSegments(false)) {
double balanceScore = currentAlignedBlock.computeBalance(previousAlignedBlock, false, true); //true for down balance and true for x axis blanace
if (balanceScore > Double.parseDouble(properties.getProperty(BACK_OFF))) {
mergePrevious = true;
}
}
}
}
if (index < alignedBlocks.size() - 1) {
// test whether the current aligned block can merge with its next aligned block
AlignedBlock nextAlignedBlock = (AlignedBlock)alignedBlocks.get(index + 1);
if (currentAlignedBlock.getBalanceStatus() == -1) {
if (nextAlignedBlock.hasSegments(true)) {
double balanceScore = currentAlignedBlock.computeBalance(nextAlignedBlock, true, false); //true for down balance and false for y axis balance
if (balanceScore > Double.parseDouble(properties.getProperty(BACK_OFF))) {
mergeNext = true;
}
}
} else { // ie. currentAlignedBlock.getBalanceStatus() == 1
if (nextAlignedBlock.hasSegments(false)) {
double balanceScore = currentAlignedBlock.computeBalance(nextAlignedBlock, true, true); //true for down balance and true for x axis blanace
if (balanceScore > Double.parseDouble(properties.getProperty(BACK_OFF))) {
mergeNext = true;
}
}
}
}
// merge with previous and/or next aligned block and prepare for retest
if (mergePrevious && mergeNext) {
currentAlignedBlock.merge((AlignedBlock)alignedBlocks.get(index - 1));
currentAlignedBlock.merge((AlignedBlock)alignedBlocks.get(index + 1));
alignedBlocks.remove(index + 1);
alignedBlocks.remove(index - 1);
index -= 2;
continue;
}
if (mergePrevious) {
currentAlignedBlock.merge((AlignedBlock)alignedBlocks.get(index - 1));
alignedBlocks.remove(index - 1);
index -= 2;
continue;
}
if (mergeNext) {
currentAlignedBlock.merge((AlignedBlock)alignedBlocks.get(index + 1));
alignedBlocks.remove(index + 1);
index--;
continue;
}
}
} // end of for loop
}
/**
* Reprocesses aligned blocks by consulting the gcalign algorithm.
* @param alignedBlocks list of aligned blocks
*/
public void gcAlign(List alignedBlocks) {
for (int index = 0; index < alignedBlocks.size(); index++) {
AlignedBlock alignedBlock = (AlignedBlock)alignedBlocks.get(index);
List xSegments = alignedBlock.getSegments(true, false);
List ySegments = alignedBlock.getSegments(false, false);
if (xSegments.size() > 1 && ySegments.size() > 1) {
//only consult gcalign algorithm when the alignment involves more than 2 segments
GCalign gcAlign = new GCalign();
String[] params = new String[4];
params[0] = "-d";
params[1] = "<BRK>";
params[2] = "-i";
StringBuffer param = new StringBuffer();
boolean isFirst = true;
Iterator xSegmentIter = xSegments.iterator();
while (xSegmentIter.hasNext()) {
if (isFirst) {
isFirst = false;
} else {
param.append(",");
}
param.append(((Segment)xSegmentIter.next()).getDistance());
}
param.append(",").append(params[1]);
Iterator ySegmentIter = ySegments.iterator();
while (ySegmentIter.hasNext()) {
param.append(",");
param.append(((Segment)ySegmentIter.next()).getDistance());
}
params[3] = param.toString();
List alignmentStrings = gcAlign.gcalign(params);
//insert new aligned blocks and remove the old aligned block
int xStartIndex = 0;
int yStartIndex = 0;
Iterator alignmentIter = alignmentStrings.iterator();
while (alignmentIter.hasNext()) {
String alignmentString = (String)alignmentIter.next();
StringTokenizer st = new StringTokenizer(alignmentString);
int xSegmentNumber = Integer.parseInt(st.nextToken());
st.nextToken();
int ySegmentNumber = Integer.parseInt(st.nextToken());
AlignedBlock newAlignedBlock = new AlignedBlock();
for (int xIndex = xStartIndex; xIndex < xStartIndex + xSegmentNumber; xIndex++) {
newAlignedBlock.addSegment((Segment)xSegments.get(xIndex), true);
}
for (int yIndex = yStartIndex; yIndex < yStartIndex + ySegmentNumber; yIndex++) {
newAlignedBlock.addSegment((Segment)ySegments.get(yIndex), false);
}
alignedBlocks.add(index, newAlignedBlock);
xStartIndex += xSegmentNumber;
yStartIndex += ySegmentNumber;
index++;
}
alignedBlocks.remove(index);
index--;
}
} // end of for loop
}
/**
* Prints out alignments.
* @param alignedBlocks list of aligned blocks
*/
public void printAlignedBlocks(List alignedBlocks) {
Iterator iterator = alignedBlocks.iterator();
String gsaOut = properties.getProperty(GSA + "." + SIMR.OUTPUT_FILE);
if (gsaOut == null) {
while (iterator.hasNext()) {
System.err.println(((AlignedBlock)iterator.next()).toString());
}
return;
}
OutputFileHandler out = new OutputFileHandler(properties.getProperty(GSA + "." + SIMR.OUTPUT_FILE));
while (iterator.hasNext()) {
out.write(((AlignedBlock)iterator.next()).toString() + "\n");
}
out.close();
}
/**
* Main method.
* @param args command line arguments
*/
public static void main (String[] args) {
GSA gsa = new GSA(args);
List alignedBlocks = gsa.generateAlignedBlocks();
gsa.printAlignedBlocks(alignedBlocks);
System.exit(0);
}
}