/* * Clusters.java * * ... a place to collect all necessary information * about matches for the elements under consideration, * be they * anchor word matches, * proper names matches, * dice matches, * all of the three, i.e, all the word based methods, * or special character matches * ... * @author Oystein Reigem */ package aksis.alignment; import java.util.*; /** * single word reference in a Cluster in Clusters. * what about phrases ############ */ class Ref { // a reference to word pos in the considered elements of text t // type of match. // if the value is non-negative the match is an anchor word match, // and the value is the number of the matching entry in the anchor word list. // if the value is negative the match is some other match - // proper name match, Dice match, etc, depending on the value // (Match.PROPER, Match.DICE, etc) private int matchType; // 2006-04-05 // weight private float weight; // 2006-04-05 // text number private int t; // number of the element where the word occurs private int elementNumber; // 2006-04-05 //// word number (position) within the relevant elements of that text // the word's position within its element // 2006-04-05 // if it's a phrase: the position of the _first_ word in the phrase // 2006-04-07 private int pos; // 2006-04-07 // length of phrase // 2006-04-07 private int len; // word as written in the text. // if it's a phrase: words separated with space // 2006-04-07 private String word; //Ref(int t, int pos1) { //Ref(int t, int pos1, String word) { //Ref(int matchType, float weight, int t, int elementNumber, int pos, String word) { // 2006-04-05 Ref(int matchType, float weight, int t, int elementNumber, int pos, int len, String word) { // 2006-04-07 this.matchType = matchType; // 2006-04-05 this.weight = weight; // 2006-04-05 this.t = t; this.elementNumber = elementNumber; // 2006-04-05 this.pos = pos; this.len = len; // 2006-04-07 this.word = word; } // checks if equal content ### øh boolean matches(Ref otherRef) { //2006-04-05 // (not necessary to check match type or word) //return ((this.t == otherRef.t) && (this.pos == otherRef.pos)); //return ((this.t == otherRef.t) && (this.elementNumber == otherRef.elementNumber) && (this.pos == otherRef.pos)); // 2006-04-05 //return ((this.matchType == otherRef.matchType) && (this.t == otherRef.t) && (this.elementNumber == otherRef.elementNumber) && (this.pos == otherRef.pos)); // 2006-04-05 // 2006-04-06 //System.out.println("\n"); //System.out.println("enter Ref.matches(otherRef). this ref = " + this + ". otherRef = " + otherRef); //if ((this.t == otherRef.t) && (this.elementNumber == otherRef.elementNumber) && (this.pos == otherRef.pos)) { if ((this.t == otherRef.t) && (this.elementNumber == otherRef.elementNumber) && (Utils.overlaps(this.pos, this.len, otherRef.pos, otherRef.len))) { // 2006-04-07 // exactly the same word occurrence. // if phrase(s): at least one of the words in the phrases is exactly the same word occurrence. // 2006-04-07 // (this is the first possible kind of match) //System.out.println("exactly the same word occurrence"); //System.out.println("\n"); return true; } else { if (otherRef.matchType >= 0) { // the other ref is an anchor word ref if (this.matchType == otherRef.matchType) { // both are anchor words that belong to the same anchor word list entry. // (this is the second and last possible kind of match) //System.out.println("both are anchor words that belong to the same anchor word list entry"); //System.out.println("\n"); return true; } } } //System.out.println("matches not"); //System.out.println("\n"); return false; // end 2006-04-06 } // 2006-04-06 // vet ikke om bruk for denne boolean exactlyMatches(Ref otherRef) { // ####hva med weight? //return ((this.matchType == otherRef.matchType) && (this.t == otherRef.t) && (this.elementNumber == otherRef.elementNumber) && (this.pos == otherRef.pos)); //return ((this.matchType == otherRef.matchType) && (this.t == otherRef.t) && (this.elementNumber == otherRef.elementNumber) && (Utils.overlaps(this.pos, this.len, otherRef.pos, otherRef.len))); // 2006-04-07 return ((this.matchType == otherRef.matchType) && (this.t == otherRef.t) && (this.elementNumber == otherRef.elementNumber) && (this.pos == otherRef.pos) && (this.len == otherRef.len)); // 2006-04-07 } // 2006-04-05 String getWord() { return word; } int getT() { return t; } int getMatchType() { return matchType; } // end 2006-04-05 // for debugging or testing purposes. // the number of refs in the cluster public boolean typeAnchorWord() { return (matchType >= 0); } float getPos() { return pos; } // 2006-04-07 float getWeight() { return weight; } boolean isInText(int t) { return (this.t == t); } // for debugging purposes public String toString() { return "[" + "matchType=" + matchType + ";weight=" + weight + ";t=" + t + ";elementNumber=" + elementNumber + ";pos=" + pos + ";len=" + len + ";word=" + word + "]"; // 2006-04-05 } } /** * single cluster in Clusters. * list of all word references belonging to the cluster. * 2006-04-06 * a cluster collects word occurrences that are related. * e.g, two words from _either_ text, matching the same anchor word entry, are related. * e.g, two words from the _same_ text, matching the same anchor word entry, are related. * the first case is a real case of matching words, * the other one not. * the calling methods are responsible for not adding references that do not represent matches */ class Cluster implements Cloneable { List refs; // list of Ref //int clusterScoreMethod; //Cluster(int clusterScoreMethod) { Cluster() { // 2006-04-05 refs = new ArrayList(); // this.clusterScoreMethod = clusterScoreMethod; } public Object clone() { try { return super.clone(); } catch (CloneNotSupportedException e) { throw new Error("This should not occur since we implement Cloneable"); } } List getRefs() { // 2006-04-05 return refs; } void add(Ref otherRef) { //System.out.println(">>> >>> >>> enter Cluster.add(otherRef). content = " + this + ". otherRef = " + otherRef); Iterator rIt = refs.iterator(); Ref ref; while (rIt.hasNext()) { ref = (Ref)rIt.next(); //if (ref.matches(otherRef)) { // 2006-04-06 TULL!! if (ref.exactlyMatches(otherRef)) { // 2006-04-06 vet ikke om det noensinne vil bli eksakt match //System.out.println(">>> >>> >>> Cluster.add(otherRef) forts. the cluster already contains this reference"); // the cluster already contains this reference // (necessarily with the same word) return; } } //System.out.println(">>> >>> >>> Cluster.add(otherRef) forts. this reference is new. add it to the cluster"); // this reference is new. // add it to the cluster refs.add(otherRef); //System.out.println(">>> >>> >>> exit Cluster.add(otherRef). content = " + this); return; } boolean matches(Ref otherRef) { // does this cluster match other ref? // //System.out.println("enter Cluster.matches(otherRef). content = " + this + ". otherRef = " + otherRef); Iterator rIt = refs.iterator(); Ref ref; while (rIt.hasNext()) { ref = (Ref)rIt.next(); //System.out.println("..... Cluster.matches(otherRef). sammenlikner med ref = " + ref); if (ref.matches(otherRef)) { //System.out.println("exit Cluster.matches(otherRef). match!"); return true; } } //System.out.println("exit Cluster.matches(otherRef). ikke match"); return false; } boolean matches(Cluster otherCluster) { Iterator orIt = otherCluster.refs.iterator(); Ref otherRef; while (orIt.hasNext()) { otherRef = (Ref)orIt.next(); if (this.matches(otherRef)) { return true; } } return false; } void add(Cluster otherCluster) { //System.out.println(">>> >>> enter Cluster.add(Cluster). content = " + this); Iterator orIt = otherCluster.refs.iterator(); Ref otherRef; while (orIt.hasNext()) { otherRef = (Ref)orIt.next(); //System.out.println(">>> >>> skal adde otherRef = " + otherRef); this.add(otherRef); } //System.out.println(">>> >>> leave Cluster.add(Cluster). content = " + this); } // a cluster contains references to words in several text (in practice two texts). // some texts may have more references than others, // and this method finds the largest number of references //int getScore() { //int getScore(int clusterScoreMethod) { // 2006-04-05 //float getScore(int clusterScoreMethod) { // 2006-04-07 float getScore(int largeClusterScorePercentage) { //System.out.println("enter Cluster.getScore(). clusterScoreMethod = " + clusterScoreMethod + ", Cluster = " + this); //System.out.println("enter Cluster.getScore(). largeClusterScorePercentage = " + largeClusterScorePercentage + ", Cluster = " + this); //if (this.size() > 2) { // System.out.println("enter Cluster.getScore(). Cluster = " + this); //} /* // 2006-04-07 if (clusterScoreMethod == 1) { // each cluster scores exactly 1 return 1; } */ /* ////////// test ////////////// if (true) { //System.out.println("exit Cluster.getScore()"); return 1; } */ int high = 0; int low = Integer.MAX_VALUE; Iterator rIt; Ref ref; float clusterWeight = 0.0f; // 2006-04-07 for (int t=0; t positions = new ArrayList(); // list to collect unique positions rIt = refs.iterator(); while (rIt.hasNext()) { ref = (Ref)rIt.next(); //System.out.println("ref = " + ref); if (ref.isInText(t)) { //System.out.println("ref er i tekst " + t); //count++; // ########## teller for mye. skal bare telle antall forskjellige posisjoner, ikke antall enkelt-referanser if (!positions.contains(ref.getPos())) { positions.add(ref.getPos()); } clusterWeight = Math.max(clusterWeight, ref.getWeight()); // 2006-04-07 } count = positions.size(); } /* if (clusterScoreMethod == 2) { // each cluster scores equal to its minimal dimension. // e.g if each of 2 words in one text match each of 3 words in the other text, // X X X // | | | // X-+-+-+ // | | | // X-+-+-+ // the score is min(2, 3) = 2. // some clusters may not be regular grids, e.g // X X X X // | | | | // X-+-+ | | // | | | | // X-+-+-+ | // | | // X-----+-+ // in the latter example the minimal dimension is 3 low = Math.min(low, count); } else { // if (clusterScoreMethod == 3) // each cluster scores equal to its maximal dimension. high = Math.max(high, count); } */ low = Math.min(low, count); high = Math.max(high, count); } /* if (clusterScoreMethod == 1) { // 2006-04-07 //return 1; return clusterWeight; // 2006-04-07 } else if (clusterScoreMethod == 2) { //return low; return clusterWeight * low; // 2006-04-07 } else { // if (clusterScoreMethod == 3) //return high; return clusterWeight * high; // 2006-04-07 } */ //System.out.println("low = " + low); return clusterWeight * ( 1 + ((low - 1) * largeClusterScorePercentage / 100.0f) ); } //String getWords() { List getWords(boolean includeMatchType) { // List of String // 2006-04-05 //System.out.println("¤¤¤ ¤¤¤ enter getWords()"); // includeMatchType ### ugly ### + 1 // 2006-04-05 // må kunne returnere en hel liste av String, // for det kan være flere matchType-r i clusteret, // i praksis match-er for mer enn én ankerordsentry Cluster sortedCluster = (Cluster)this.clone(); // sort on (1) match type (in case it contains anchor word entry number), (2) text number, (3) word // 2006-04-05 Collections.sort(sortedCluster.refs, new RefComparator()); Iterator rIt = sortedCluster.refs.iterator(); List ret = new ArrayList(); // the list to return // 2006-04-05 String retLine = ""; // one item in the list int prevMatchType = Integer.MIN_VALUE; // 2006-04-05 int prevT = -1; int numSlashes; //boolean first = true; boolean firstInCurrMatchType; // 2006-04-05 boolean firstInCurrText; while (rIt.hasNext()) { Ref ref = (Ref)rIt.next(); //System.out.println("¤¤¤ ¤¤¤ getWords() forts. ref = " + ref); // 2006-04-05 firstInCurrMatchType = (ref.getMatchType() > prevMatchType); //System.out.println("¤¤¤ ¤¤¤ firstInCurrMatchType = " + firstInCurrMatchType); if (firstInCurrMatchType) { // starting on a new match type, // i.e, starting on a new item in the list to return // add the current item - if any - to the list before starting anew if (retLine != "") { ret.add(retLine); // reset retLine = ""; prevT = -1; } // ... if (includeMatchType) { // ### ugly ### int matchType = ref.getMatchType(); int temp = matchType + 1; // ### +1 because we want human numbering of anchor word entries // ugly because in principle there could be a value for something different than the number of a anchor word entry retLine += temp + " "; } } // end 2006-04-05 firstInCurrText = (ref.getT() > prevT); // 2006-04-05 //System.out.println("¤¤¤ ¤¤¤ firstInCurrText = " + firstInCurrText); if (firstInCurrText) { if (firstInCurrMatchType) { // 2006-04-05 numSlashes = ref.getT() - prevT -1; // 2006-04-05 firstInCurrMatchType = false; // 2006-04-05 } else { numSlashes = ref.getT() - prevT; // 2006-04-05 } for (int i=0; i 2) { //if (countAnchordWordRefs() > 2) { if (countAnchordWordEntries() > 2) { Iterator rIt = getRefs().iterator(); Ref ref; boolean first = true; while (rIt.hasNext()) { ref = (Ref)rIt.next(); if (first) { first = false; } else { retVal += "\n"; } retVal += ref.toString(); } retVal += "\n"; } if (retVal != "") { retVal = "non-trivial cluster:\n\n" + retVal; } return retVal; } } /** * information about how related words in the texts collect in clusters */ class Clusters { List clusters; // list of Cluster //int clusterScoreMethod; // 2006-04-05 //Clusters(int clusterScoreMethod) { Clusters() { // 2006-04-05 clusters = new ArrayList(); //this.clusterScoreMethod = clusterScoreMethod; // 2006-04-05 } // 2006-04-05 // word pos1 in element elementNumber1 in text t (word1) is related to // word pos2 in element elementNumber2 in text tt (word2). // matchType is the type of match, as explained elsewhere. // weight is the weight assigned to the match. // add that information //void add(int t, int u, int pos1, int pos2) { //void add(int t, int u, int pos1, int pos2, String word1, String word2) { //void add(int matchType, float weight, int t, int tt, int elementNumber1, int elementNumber2, int pos1, int pos2, String word1, String word2) { // 2006-04-05 void add(int matchType, float weight, int t, int tt, int elementNumber1, int elementNumber2, int pos1, int pos2, int len1, int len2, String word1, String word2) { // 2006-04-05 //System.out.println("enter Clusters.add(matchType= " + matchType + ", weight=" + weight + ", t=" + t + ", tt=" + tt + ", pos1=" + pos1 + ", pos2=" + pos2 + "). content = " + this); // make a new Cluster from the new word references. // add the new Cluster. // 2006-04-06 // important that the two Ref's are added as a Cluster and not as single Ref's, // because the Ref.add(Ref) method would not be able to cluster them, // because the ref.matches(Ref) method would not recognize them as related //Cluster newCluster = new Cluster(clusterScoreMethod); //newCluster.add(new Ref(t, pos1)); //newCluster.add(new Ref(u, pos2)); //newCluster.add(new Ref(t, pos1, word1)); //newCluster.add(new Ref(u, pos2, word2)); //add(new Ref(matchType, weight, t, elementNumber1, pos1, word1)); // 2006-04-05 //add(new Ref(matchType, weight, tt, elementNumber2, pos2, word2)); // 2006-04-05 // 2006-04-06 Cluster newCluster = new Cluster(); //newCluster.add(new Ref(matchType, weight, t, elementNumber1, pos1, word1)); //newCluster.add(new Ref(matchType, weight, tt, elementNumber2, pos2, word2)); newCluster.add(new Ref(matchType, weight, t, elementNumber1, pos1, len1, word1)); // 2006-04-07 newCluster.add(new Ref(matchType, weight, tt, elementNumber2, pos2, len2, word2)); // 2006-04-07 add(newCluster); // end 2006-04-06 } // 2006-04-05 void add(Ref ref) { // 2006-04-05 //// make a new cluster from the new word reference // 2006-04-05 //System.out.println(">>> enter Clusters.add(). skal legge ny ref til clusters. ref = " + ref); //System.out.println(">>> Clusters content = " + this); //Cluster newCluster = new Cluster(); // 2006-04-05 //newCluster.add(ref); // 2006-04-05 ////System.out.println("lager foreløpig et nytt cluster = " + newCluster); //// check the new cluster against all existing clusters to see if there is overlap // check the ref against all existing clusters to see if there is overlap List overlaps = new ArrayList(); // list of Cluster Iterator cIt = clusters.iterator(); Cluster cluster; while (cIt.hasNext()) { cluster = (Cluster)cIt.next(); ////System.out.println("sammenlikner nye cluster mot cluster = " + cluster); //System.out.println(">>> sammenlikner ref mot cluster = " + cluster); //if (newCluster.matches(cluster)) { if (cluster.matches(ref)) { ////System.out.println("nye cluster matcher cluster"); //System.out.println(">>> ref matcher cluster. overlaps øker"); overlaps.add(cluster); } } //// merge the new cluster with all overlapping clusters // merge the ref with all overlapping clusters. // first make a cluster from the ref //System.out.println(">>> merge the ref with all overlapping clusters"); Cluster mergedCluster = new Cluster(); // 2006-04-05 mergedCluster.add(ref); // 2006-04-05 //System.out.println(">>> init mergedCluster = " + mergedCluster); Iterator oIt = overlaps.iterator(); while (oIt.hasNext()) { //System.out.println(">>> merge"); cluster = (Cluster)oIt.next(); mergedCluster.add(cluster); //System.out.println(">>> mergedCluster = " + mergedCluster); clusters.remove(cluster); } // add the new, merged cluster //System.out.println(">>> add the new, merged cluster"); clusters.add(mergedCluster); //System.out.println(">>> leave Clusters.add(). content = " + this); } // 2006-04-05 // merge two Clusters void add(Clusters otherClusters) { Iterator ocIt = otherClusters.clusters.iterator(); while (ocIt.hasNext()) { Cluster otherCluster = (Cluster)ocIt.next(); add(otherCluster); } } // add a Cluster to a Clusters void add(Cluster otherCluster) { /* 2006-04-06 ### dette splitter lett opp igjen otherCluster i de enkelte Ref. må ha en metode som likner add(Ref) Iterator orIt = otherCluster.getRefs().iterator(); while (orIt.hasNext()) { Ref otherRef = (Ref)orIt.next(); add(otherRef); } */ // 2006-04-06 // check the other cluster against all existing clusters to see if there is overlap List overlaps = new ArrayList(); // list of Cluster Iterator cIt = clusters.iterator(); Cluster cluster; while (cIt.hasNext()) { cluster = (Cluster)cIt.next(); Iterator orIt = otherCluster.refs.iterator(); boolean match = false; while (orIt.hasNext()) { Ref otherRef = (Ref)orIt.next(); if (cluster.matches(otherRef)) { match = true; break; } } if (match) { // one ref in the other cluser matched, and therefore the whole cluster matched overlaps.add(cluster); } } // clone the other cluster and merge it with all overlapping clusters Cluster mergedCluster = (Cluster)otherCluster.clone(); Iterator oIt = overlaps.iterator(); while (oIt.hasNext()) { cluster = (Cluster)oIt.next(); mergedCluster.add(cluster); clusters.remove(cluster); } // add the new, merged cluster clusters.add(mergedCluster); // end 2006-04-06 } // end 2006-04-05 //int getScore() { //int getScore(int clusterScoreMethod) { // 2006-04-05 float getScore(int clusterScoreMethod) { // 2006-04-07 //System.out.println("enter Clusters.getScore(). content = " + this); //System.out.println("enter Clusters.getScore()"); //int score = 0; float score = 0.0f; // 2006-04-07 // each cluster adds to the score. // each cluster contains references to words in several text (in practice 2). // the score is the largest number of referred words in any text Iterator cIt = clusters.iterator(); Cluster cluster; while (cIt.hasNext()) { cluster = (Cluster)cIt.next(); //System.out.println("Clusters.getScore(). next cluster: " + cluster); //score += cluster.getScore(); score += cluster.getScore(clusterScoreMethod); //System.out.println("Clusters.getScore() forts. score økt til " + score); } //System.out.println("exit Clusters.getScore()"); return score; } //String getWords() { // ¤¤¤ or rather get details about matching words //List getDetails() { // includeMatchType ### ugly ### see Cluster.getWords(includeMatchType) List getDetails(int indentLevel, boolean includeMatchType) { // 2006-04-05 //System.out.println("\n¤¤¤ enter getDetails(). Clusters = " + this); Iterator cIt = clusters.iterator(); Cluster cluster; //String ret = ""; List ret = new ArrayList(); //boolean first = true; while (cIt.hasNext()) { //if (first) { // first = false; //} else { // ret += "\n"; //} cluster = (Cluster)cIt.next(); //System.out.println("¤¤¤ next cluster"); //ret += ElementInfoToBeCompared.INDENT + ElementInfoToBeCompared.INDENT + cluster.getWords(); // ¤¤¤¤¤¤¤¤¤¤ ElementInfoToBeCompared ¤¤¤ INDENT //ret += ElementInfoToBeCompared.INDENT + ElementInfoToBeCompared.INDENT + cluster.getWords() + "\n"; // ¤¤¤¤¤¤¤¤¤¤ ElementInfoToBeCompared ¤¤¤ INDENT //ret.add(ElementInfoToBeCompared.INDENT + ElementInfoToBeCompared.INDENT + cluster.getWords()); // ¤¤¤¤¤¤¤¤¤¤ ElementInfoToBeCompared ¤¤¤ INDENT // 2006-04-05 String indentation = ""; for (int i = 0; i < indentLevel; i++) { indentation += ElementInfoToBeCompared.INDENT; } // ¤¤¤¤¤¤¤¤¤¤ ElementInfoToBeCompared ¤¤¤ INDENT List lines = cluster.getWords(includeMatchType); for (int j = 0; j < lines.size(); j++) { //System.out.println("¤¤¤ next line"); ret.add(indentation + lines.get(j)); } // end 2006-04-05 } //System.out.println("¤¤¤ exit getDetails()\n"); return ret; } // for debugging purposes public String toString() { Iterator cIt = clusters.iterator(); Cluster cluster; String retVal = ""; boolean first = true; retVal += "{ "; while (cIt.hasNext()) { cluster = (Cluster)cIt.next(); if (first) { first = false; } else { retVal += " ; "; } retVal += cluster.toString(); } retVal += " }"; return retVal; } // for debugging or testing purposes public String nonTrivialClusters_ToString() { Iterator cIt = clusters.iterator(); Cluster cluster; String retVal = ""; String temp; boolean first = true; while (cIt.hasNext()) { cluster = (Cluster)cIt.next(); temp = cluster.nonTrivialCluster_ToString(); if (temp != "") { if (first) { first = false; } else { retVal += "\n"; } retVal += temp; } } if (retVal != "") { retVal = "(This is not an error message)\nCONTAINS NON-TRIVIAL CLUSTERS:\n\n" + retVal; } return retVal; } }