/* * SimilarityUtils.java * * ... * ... * ... */ package aksis.alignment; // import java.util.*; //import java.lang.Math.*; import java.util.regex.*; class SimilarityUtils { //public static float dice(String word1, String word2) { public static boolean diceMatch(String word1, String word2, float diceMinCountingScore) { // 2006-08-09 // €€€ this function does case sensitive comparisons // ### not anymore // ... //System.out.println("ankommer dice(). word1=" + word1 + " word2=" + word2); int i; String word1LowerCase = word1.toLowerCase(); String word2LowerCase = word2.toLowerCase(); // find all unique bigrams in first word Hashtable uniqueBigrams1 = new Hashtable(); int countBigrams1; for (i = 0; i < word1LowerCase.length() - 1; i++) { uniqueBigrams1.put(word1LowerCase.substring(i, i+2), new Integer(1)); } countBigrams1 = uniqueBigrams1.size(); // find all unique bigrams in second word Hashtable uniqueBigrams2 = new Hashtable(); int countBigrams2; for (i = 0; i < word2LowerCase.length() - 1; i++) { uniqueBigrams2.put(word2LowerCase.substring(i, i+2), new Integer(1)); } countBigrams2 = uniqueBigrams2.size(); // count all shared bigrams ////Hashtable sharedBigrams = new Hashtable(); int countSharedBigrams = 0; for (Enumeration e = uniqueBigrams1.keys(); e.hasMoreElements();) { if (uniqueBigrams2.containsKey(e.nextElement())) { countSharedBigrams++; } } //System.out.println("dice() forts. countBigrams=" + countBigrams1 + " countBigrams2=" + countBigrams2 + " countSharedBigrams=" + countSharedBigrams); // calculate dice score float diceScore; if ((countBigrams1 == 0) || (countBigrams2 == 0)) { diceScore = 0; } else { ////diceScore = countSharedBigrams / (countBigrams1 + countBigrams2); ????????????? diceScore = (2 * (float)countSharedBigrams) / (countBigrams1 + countBigrams2); // ????????????? } //System.out.println("dice() forts. diceScore=" + diceScore); // // decide if scores are high enough // 2006-08-09 //return diceScore; return (diceScore >= diceMinCountingScore); // 2006-08-09 } //#######den er jo helt feil. den ser ikke på hvordan hvert av ordene i frasen matcher. //#######aha. kanskje jeg tenkte at kallende rutine skulle gjøre dette, men glemte å ordne det der? //// 2006-04-18 // //// ### burde disse (metoden over og denne metoden) // //// vært slått sammen? eller kode skilt ut i felles metoder? // //public static float dice(String word1, String word2, String type) { // // String phrase; // String word; // if (type == "2-1") { // phrase = word1; // word = word2; // } else if (type == "1-2") { // word = word1; // phrase = word2; // } else { // // ### program error // return -1.0f; // } // //System.out.print("phrase=" + phrase); // //System.out.print("word=" + word); // // int i; // String wordLowerCase = word.toLowerCase(); // String phraseLowerCase = phrase.toLowerCase(); // // // find all unique bigrams in word // // Hashtable uniqueBigramsInWord = new Hashtable(); // int countBigramsInWord; // for (i = 0; i < wordLowerCase.length() - 1; i++) { // uniqueBigramsInWord.put(wordLowerCase.substring(i, i+2), new Integer(1)); // } // countBigramsInWord = uniqueBigramsInWord.size(); // // // find all unique bigrams in phrase // // Hashtable uniqueBigramsInPhrase = new Hashtable(); // int countBigramsInPhrase; // for (i = 0; i < phraseLowerCase.length() - 1; i++) { // uniqueBigramsInPhrase.put(phraseLowerCase.substring(i, i+2), new Integer(1)); // } // countBigramsInPhrase = uniqueBigramsInPhrase.size(); // // // count all shared bigrams // // int countSharedBigrams = 0; // for (Enumeration e = uniqueBigramsInWord.keys(); e.hasMoreElements();) { // if (uniqueBigramsInPhrase.containsKey(e.nextElement())) { // countSharedBigrams++; // } // } // // // calculate dice score // // float diceScore; // if ((countBigramsInWord == 0) || (countBigramsInPhrase == 0)) { // diceScore = 0; // } else { // diceScore = (float)countSharedBigrams / countBigramsInWord; // ????????????? ############## // // which is the same as // // diceScore = (2 * (float)countSharedBigrams) / (countBigramsInWord * 2); // // where we sort of pretend the phrase is of the same length as the word // } // // return diceScore; // //} //// end 2006-04-18 // 2006-08-09 public static boolean diceMatch(String wordA, String wordB, String wordC, String type, float diceMinCountingScore) { // if type = "2-1" the phrase wordA + wordB is to be compared with the word wordC . // if type = "1-2" the word wordA is to be compared with the phrase wordB + wordC. String phraseWord1, phraseWord2; String word; String phrase; if (type == "2-1") { phraseWord1 = wordA; phraseWord2 = wordB; word = wordC; } else if (type == "1-2") { word = wordA; phraseWord1 = wordB; phraseWord2 = wordC; } else { // ### program error //return -1.0f; return false; // 2006-08-09 } //System.out.print("phraseWord1=" + phraseWord1); //System.out.print("phraseWord2=" + phraseWord2); //System.out.print("word=" + word ); phrase = phraseWord1 + phraseWord2; int i; String wordLowerCase = word.toLowerCase(); String phraseWord1LowerCase = phraseWord1.toLowerCase(); String phraseWord2LowerCase = phraseWord2.toLowerCase(); String phraseLowerCase = phraseWord1LowerCase + phraseWord2LowerCase; // find all unique bigrams in word Hashtable uniqueBigramsInWord = new Hashtable(); int countBigramsInWord; for (i = 0; i < wordLowerCase.length() - 1; i++) { uniqueBigramsInWord.put(wordLowerCase.substring(i, i+2), new Integer(1)); } countBigramsInWord = uniqueBigramsInWord.size(); // find all unique bigrams in phrase's 1st word Hashtable uniqueBigramsInPhraseWord1 = new Hashtable(); int countBigramsInPhraseWord1; for (i = 0; i < phraseWord1LowerCase.length() - 1; i++) { uniqueBigramsInPhraseWord1.put(phraseWord1LowerCase.substring(i, i+2), new Integer(1)); } countBigramsInPhraseWord1 = uniqueBigramsInPhraseWord1.size(); // count all shared bigrams int countSharedBigrams1 = 0; for (Enumeration e = uniqueBigramsInWord.keys(); e.hasMoreElements();) { if (uniqueBigramsInPhraseWord1.containsKey(e.nextElement())) { countSharedBigrams1++; } } // find all unique bigrams in phrase's 2nd word Hashtable uniqueBigramsInPhraseWord2 = new Hashtable(); int countBigramsInPhraseWord2; for (i = 0; i < phraseWord2LowerCase.length() - 1; i++) { uniqueBigramsInPhraseWord2.put(phraseWord2LowerCase.substring(i, i+2), new Integer(1)); } countBigramsInPhraseWord2 = uniqueBigramsInPhraseWord2.size(); // count all shared bigrams int countSharedBigrams2 = 0; for (Enumeration e = uniqueBigramsInWord.keys(); e.hasMoreElements();) { if (uniqueBigramsInPhraseWord2.containsKey(e.nextElement())) { countSharedBigrams2++; } } // calculate dice scores float diceScore1; if ((countBigramsInWord == 0) || (countBigramsInPhraseWord1 == 0)) { diceScore1 = 0; } else { diceScore1 = (float)countSharedBigrams1 / countBigramsInPhraseWord1; // ????????????? ############## // which is the same as // diceScore = (2 * (float)countSharedBigrams1) / (countBigramsInPhraseWord1 * 2); // where we sort of pretend the phrase is of the same length as the word } float diceScore2; if ((countBigramsInWord == 0) || (countBigramsInPhraseWord2 == 0)) { diceScore2 = 0; } else { diceScore2 = (float)countSharedBigrams2 / countBigramsInPhraseWord2; // ????????????? ############## // which is the same as // diceScore = (2 * (float)countSharedBigrams2) / (countBigramsInPhraseWord2 * 2); // where we sort of pretend the phrase is of the same length as the word } // decide if scores are high enough return ((diceScore1 >= diceMinCountingScore) && (diceScore2 >= diceMinCountingScore)); } // end 2006-08-09 public static boolean anchorMatch(String anchorWord, String word) { // is the word word an occurrence of the anchor word anchorWord? // anchorWord may contain wildcard characters: '*' = any string. // anchorWord is not "pre-compiled" into a regular expression. // perhaps it should have been // make a proper regular expression from the anchor word String pattern = "^" + anchorWord.replaceAll("[*]", ".*") + "$"; // .replaceAll(regexp, string) - fikk ikke til å bruke "\*" //System.out.print("pattern = " + pattern + "\n"); //if (word.matches(pattern)) { // System.out.print(">>>pattern " + pattern + " matcher " + word + "\n"); //} else { // System.out.print(">>>pattern " + pattern + " matcher ikke " + word + "\n"); //} //return word.matches(pattern); Pattern compiledPattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE + Pattern.UNICODE_CASE); Matcher matcher = compiledPattern.matcher(word); /*if (matcher.matches()) { System.out.print(">>>pattern " + pattern + " matcher " + word + "\n"); } else { System.out.print(">>>pattern " + pattern + " matcher ikke " + word + "\n"); }*/ return matcher.matches(); } // 2006-09-20 public static boolean badLengthCorrelation(int length1, int length2, int elementCount1, int elementCount2, float ratio) { // some dupl code adjustForLengthCorrelation float killLimit = 0.5f; // less tolerant limit for 1-2 and 2-1, above which such alignments score lethally low float c = (float)(2 * Math.abs(0.0f + ratio*length1 - length2) / (ratio*length1 + length2)); return (((elementCount1 > 0) && (elementCount2 > 0) && (elementCount1 != elementCount2)) && (c > killLimit)); } // end 2006-09-20 //public static float adjustForLengthCorrelation(float score, int length1, int length2) { //public static float adjustForLengthCorrelation(float score, int length1, int length2, float ratio) { public static float adjustForLengthCorrelation(float score, int length1, int length2, int elementCount1, int elementCount2, float ratio) { // two units are compared, each consisting of one (???????????????) or more elements. // this methods calculates how well the lengths of the two compare, // and adjusts the score // ### formelen er litt merkelig. bør forbedres float newScore = 0.0f; float lowerLimit = 0.4f; float upperLimit = 1.0f; float killLimit = 0.5f; // less tolerant limit for 1-2 and 2-1, above which such alignments score lethally low //float c = (float)(2 * Math.abs(0.0f + length1 - length2) / (length1 + length2)); float c = (float)(2 * Math.abs(0.0f + ratio*length1 - length2) / (ratio*length1 + length2)); if ((elementCount1 > 0) && (elementCount2 > 0) && (elementCount1 != elementCount2)) { // 1-2 or 2-1. // be less tolerant of length differences if (c < lowerLimit/2) { newScore = score + 2; } else if (c < lowerLimit) { newScore = score + 1; } else if (c > killLimit) { // 'less tolerant' might be an understatement. // actually kill this path altogether by setting a very low score //newScore = -Float.MAX_VALUE; //newScore = -99999.0f; newScore = AlignmentModel.ELEMENTINFO_SCORE_HOPELESS; // 2006-09-20 } else { newScore = score; } } else { //System.out.println("lengthCorrelation(). length1=" + length1 + "(" + ratio*length1 + "), length2=" + length2 + ", c=" + c); if (c < lowerLimit/2) { newScore = score + 2; } else if (c < lowerLimit) { newScore = score + 1; } else if (c > upperLimit) { newScore = score / 3; } else { newScore = score; // (2005-08-17) } } //System.out.println("@@@ lengthCorrelation(). modifisert score (newScore) =" + newScore); return newScore; } }