package werti.uima.ae;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;

import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

import werti.WERTiContext;
import werti.WERTiContext.WERTiContextException;
import werti.uima.types.annot.PlainTextSentenceAnnotation;
import werti.uima.types.annot.RelevantText;
import werti.uima.types.annot.Token;
import werti.util.CasUtils;

/**
 * HTML content type annotator:
 *
 * Classifies relevant text spans into headline, body, boilerplate, etc.
 *
 * @author Adriane Boyd
 */
public class HTMLContentTypeAnnotator extends JCasAnnotator_ImplBase {

    private static final Logger log =
            Logger.getLogger(HTMLContentTypeAnnotator.class);

    public static final String className = "wertiview";

    // margin for double comparison
    public static final double EPSILON = Math.pow(10, -14);

    // whether to perform smoothing
    private static final boolean DO_SMOOTHING = true;

    // CACHES
    // (tied to the respective DOM tree)
    private Element emptyElement;

    // assigns a unique id to each element
    private Map<Element, Integer> id;
    private int curId;

    // list of ids of all WERTi spans (will be initialized as soon as we know
    // how many WERTi spans we have)
    private int[] wertiIds;
    private int wertiIdsIdx;

    // feature cache; access: features[featureName][wertiId]
    private Map<String, double[]> features;

    // the token annotation for this document
    private AnnotationIndex tokenIndex;
    private AnnotationIndex sentIndex;

    // caches for tokenized text (keys are ids)
    private List<String[]> tokenCache;

    // cache for the div/td word counts; the empty element is the key for the
    // sum of all words outside any div/td
    private Map<Element, Double> divTdWordCounts;

    // number of words in the document that are not in an <a> tag
    private int docNonAnchorWordCount;
    // the same for every block
    private List<Integer> nonAnchorWordCount;

    // LISTS
    // list of all feature names
    private static final String[] featureNames = {"p", "p immediate", "a",
            "a immediate", "h1-6", "list", "split p", "split br",
            "split inline", "split block", "char alpha", "char digit",
            "char punct", "char white", "char alphaRel", "char digitRel",
            "char punctRel", "char whiteRel", "char otherRel", "token alpha",
            "token digit", "token other", "token alphaRel", "token digitRel",
            "token otherRel", "token avgTokenLength", "token numUpperTokens",
            "token ratioUpperTokens", "token numAllUpperTokens",
            "token ratioAllUpperTokens", "sent numSentences",
            "sent avgNumWords", "sent sentBoundRel", "is sentence end",
            "pattern bullet 0 contained", "pattern bullet 1 contained",
            "div/td word ratio", "id", "idRel", "text density",
            "link density", "num words this",
            "sharesPWithPrePrev", "sharesDivWithPrePrev", "sharesTdWithPrePrev",
"sharesPWithPrev","sharesDivWithPrev","sharesTdWithPrev", "sharesPWithNext","sharesDivWithNext","sharesTdWithNext", "sharesPWithPostNext","sharesDivWithPostNext","sharesTdWithPostNext"}; // The following attribute indices are all zero-based. // attribute index of 'sent numSentences' private static final int numSentAttrIdx = 30; // index of "num words this" in this list private static final int numWordsThisAttrIdx = 42; // number of 'shares*' features private static final int numSharesFeats = 12; // index in the feature name list where the 'shares*' features start private static final int sharesFeatsStartIdx = featureNames.length - numSharesFeats; // attribute indices of the 'shares*' attributes private static final int sharesAttrsStart = 42; // (end is exclusive) private static final int sharesAttrsEnd = sharesAttrsStart+numSharesFeats; // attribute index of the 'Classification' attribute (filled by the first // classifier) private static final int classAttrIdx = 61; // attribute index of the 'annotation' attribute (filled by the 2nd // classifier) private static final int annotationAttrIdx = 64; // names of the features that involve more than one span private static final String[] additionalFeatureNames = {"p/c num words", "p/c text density","|p-c| text density","num words prev", "num words next"}; private static final int numWordsPrevAttrIdx = featureNames.length + 3; private static final int numWordsNextAttrIdx = featureNames.length + 4; // name of the class attribute private static final String classFeatureName = "annotation"; // possible values for the class attribute private static final String[] classFeatureValues = { "Not content_Related content", "Headline", "Supplemental", "Full text_Comments"}; // the CSS classes must not contain spaces or special chars private static final String boilerplate = "boilerplate"; private static final String headline = "headline"; private static final String supplemental = "supplemental"; private static final String content = "content"; private static final String[] cssClasses = {boilerplate, headline, supplemental, content}; private static TreeMap classToCss = null; // useful lists of tags that can be seen as one 'class' of tags private static final String[] TAGCLASS_LIST = {"li", "ul", "ol"}; private static final String[] TAGCLASS_IGNORE = {"script", "noscript", "form", "object", "embed", "head", "meta", "link", "title", "applet", "style"}; // PATTERNS // pattern that matches anything private static final Pattern PAT_ANYTHING = Pattern.compile(".*"); // tag name patterns private static final Pattern PAT_TAGNAME_DIV = Pattern.compile( "[Dd][Ii][Vv]"); private static final Pattern PAT_TAGNAME_TD = Pattern.compile("[Tt][Dd]"); private static final Pattern PAT_TAGNAME_P = Pattern.compile("[Pp]"); // from boilerpipe, for tokenizer private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b"); private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern.compile( "[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*"); // matches any tag name except for private static final Pattern PAT_NON_ANCHOR_TAG = Pattern .compile(".*[^a]+.*"); // a token that ends a sentence (matches "." 
and "word.") private static final Pattern PAT_SENTENCE_END_STRICT = Pattern .compile("[.?!]+"); private static final Pattern PAT_NON_SENTENCE_END = Pattern .compile(".*[^.?!]+"); // character class regexps private static final Pattern PAT_UPPER = Pattern.compile("\\p{Lu}"); private static final Pattern PAT_ALL_UPPER = Pattern.compile("\\p{Lu}+"); private static final Pattern PAT_ALPHAS = Pattern.compile("(\\p{L})+"); private static final Pattern PAT_DIGITS = Pattern.compile("(\\p{N})+"); private static final Pattern PAT_ALPHADIGITS = Pattern .compile("[\\p{L}\\p{N}]+"); private static final Pattern PAT_NON_ALPHAS = Pattern.compile( "[^\\p{L}]+"); private static final Pattern PAT_NON_DIGITS = Pattern.compile( "[^\\p{N}]+"); private static final Pattern PAT_NON_PUNCTS = Pattern.compile( "[^\\p{P}]+"); private static final Pattern PAT_NON_WHITES = Pattern.compile( "[^\\p{Z}]+"); // regexps for the bullet feature private static final Pattern[] PAT_BULLET = { Pattern.compile("\\p{Pd}.*"), Pattern.compile("[\\p{N}\\p{L}]\\..*")}; // for smoothing // J48 cannot handle missing values, so use a replacement instead private static final String MISSING_REPLACEMENT = "missing"; // size of the block n-gram to consider private static final int ngramSize = 5; // index at which the n-gram should start (relative to the current block) private static final int ngramStartIdx = - (ngramSize/2); // what to call the n-gram features: name prefixes private static final String[] ngramPrefixes = {"prePre", "pre", "", "post", "postPost"}; // the first classifier private Classifier cls; // the second classifier (for smoothing) private Classifier smoother; private Instances dataset; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); try { log.info("Loading HTML classifier model"); final String modelFileName = WERTiContext.context.getRealPath("/") + WERTiContext.p.getProperty("models.base") + WERTiContext.p.getProperty("htmlcontent.misc"); try { cls = (Classifier) SerializationHelper.read(modelFileName); } catch (Exception e) { e.printStackTrace(); throw new WERTiContextException ("Failed to load HTML content classifier model " + modelFileName); } log.info("Loading HTML smoother model"); final String smootherFileName = WERTiContext.context.getRealPath("/") + WERTiContext.p.getProperty("models.base") + WERTiContext.p.getProperty("htmlsmoother.misc"); try { smoother = (Classifier) SerializationHelper.read( smootherFileName); } catch (Exception e) { e.printStackTrace(); throw new WERTiContextException ("Failed to load HTML content smoother model " + smootherFileName); } } catch (WERTiContextException wce) { throw new ResourceInitializationException(wce); } classToCss = new TreeMap(); // map the internal class names to the CSS class names for (int i=0; i(); curId = 0; wertiIds = null; wertiIdsIdx = 0; features = new HashMap(); tokenIndex = null; sentIndex = null; tokenCache = new ArrayList(); divTdWordCounts = new HashMap(); docNonAnchorWordCount = 0; nonAnchorWordCount = new LinkedList(); dataset = null; } /** * Classifies relevant text spans into headline, body, boilerplate, etc. * * @param cas The document's cas. 
*/ @Override @SuppressWarnings("unchecked") public void process(JCas cas) throws AnalysisEngineProcessException { // stop processing if the client has requested it if (!CasUtils.isValid(cas)) { return; } log.debug("Starting HTML content type annotation"); // no need to reset - we always call reset() at the end of this method // get HTML from CAS final String htmlString = cas.getDocumentText(); // parse with Jsoup final Document doc = Jsoup.parse(htmlString); // find all wertiview spans final Elements wertiTextSpans = doc.getElementsByClass(className); // prepare to iterate over relevant text pieces final AnnotationIndex rtIndex = cas.getAnnotationIndex( RelevantText.type); tokenIndex = cas.getAnnotationIndex(Token.type); sentIndex = cas.getAnnotationIndex(PlainTextSentenceAnnotation.type); // count number of words outside tags and set IDs final Iterator rtIt1 = rtIndex.iterator(); // FIXME not all web pages have a where Jsoup expects one (example: http://www.short-stories.co.uk/) docCounts(doc.body(), false, wertiTextSpans, rtIt1); // initialize wertiIds final int numWertiSpans = wertiTextSpans.size(); wertiIds = new int[numWertiSpans]; // prepare the feature cache for (int i=0; i rtIt = rtIndex.iterator(); for (int index=0; index classificationCache = new ArrayList( dataset.numInstances()); // filter to apply before the first classifier Instances firstDataset = getFirstDataset(); // classify the instances using the first classifier and add the values // for the 'shares*' features for (int i=0; i rtIt2 = rtIndex.iterator(); for (int i=0; i tokenList = tokenize(rt); final List sentList = sentDetect(rt); final int myId = id.get(e); wertiIds[wertiIdsIdx] = myId; // container features features.get("p")[wertiIdsIdx] = (double) toInt(isIn(parents, "p")); features.get("p immediate")[wertiIdsIdx] = (double) toInt( isImmediateParent(immediateParent, "p")); features.get("a")[wertiIdsIdx] = (double) toInt(isIn(parents, "a")); features.get("a immediate")[wertiIdsIdx] = (double) toInt( isImmediateParent(immediateParent, "a")); features.get("h1-6")[wertiIdsIdx] = (double) toInt( isInHeadline(parents)); features.get("list")[wertiIdsIdx] = (double) toInt( isInClass(parents, TAGCLASS_LIST)); // split features final Elements splitElements = getElementsSincePrev(prev, e); features.get("split p")[wertiIdsIdx] = (double) countSplits( splitElements, "p"); features.get("split br")[wertiIdsIdx] = (double) countSplits( splitElements, "br"); final int numBlock = countSplitsBlock(splitElements); features.get("split inline")[wertiIdsIdx] = (double) (splitElements.size()-numBlock); features.get("split block")[wertiIdsIdx] = (double) numBlock; // the letters from the current text String[] letters = new String[1]; // char features final double[] charCounts = charCounts(e.ownText(), letters); String[] charFeatNames = {"char alpha","char digit","char punct", "char white","char alphaRel","char digitRel","char punctRel", "char whiteRel","char otherRel"}; for (int i=0; i,
        // token, sentence, pattern, and density features (these assignments
        // were lost to markup stripping; reconstructed from the helper
        // methods and the feature name list)
        final double[] tokenSentCounts = tokenAndSentenceCounts(e.ownText(),
                tokenList, sentList);
        String[] tokenSentFeatNames = {"token alpha", "token digit",
                "token other", "token alphaRel", "token digitRel",
                "token otherRel", "token avgTokenLength",
                "token numUpperTokens", "token ratioUpperTokens",
                "token numAllUpperTokens", "token ratioAllUpperTokens",
                "sent numSentences", "sent avgNumWords", "sent sentBoundRel"};
        for (int i = 0; i < tokenSentFeatNames.length; i++) {
            features.get(tokenSentFeatNames[i])[wertiIdsIdx] =
                    tokenSentCounts[i];
        }

        features.get("is sentence end")[wertiIdsIdx] = (double) toInt(
                isSentenceEnd(e.ownText().trim()));

        final boolean[][] bullets = matchedAndContained(e.ownText(),
                PAT_BULLET);
        features.get("pattern bullet 0 contained")[wertiIdsIdx] =
                (double) toInt(bullets[1][0]);
        features.get("pattern bullet 1 contained")[wertiIdsIdx] =
                (double) toInt(bullets[1][1]);

        features.get("div/td word ratio")[wertiIdsIdx] =
                (docNonAnchorWordCount == 0) ? 0
                : divTdWordCount(parents) / docNonAnchorWordCount;

        features.get("id")[wertiIdsIdx] = (double) myId;
        features.get("idRel")[wertiIdsIdx] = (double) myId / curId;

        features.get("text density")[wertiIdsIdx] = textDensity(tokenList);
        features.get("link density")[wertiIdsIdx] = linkDensity(parents, rt);
        // word count of this block (reconstructed as the token count)
        features.get("num words this")[wertiIdsIdx] =
                (double) tokenList.size();

        // does this block share a <p>, <div>, or <td> parent with one of its
        // neighbor blocks? (used only for smoothing)
        final boolean[] sharesArray = sharesParentWithNeighbor(prePrev, prev,
                e, next, postNext);
        for (int i = 0; i < numSharesFeats; i++) {
            features.get(featureNames[sharesFeatsStartIdx + i])[wertiIdsIdx] =
                    (double) toInt(sharesArray[i]);
        }

        wertiIdsIdx++;
    }

    /**
     * Smooths the classifications with the second classifier, using the
     * first-pass classifications of the surrounding block n-gram as
     * features. (Only the loop head of this method survived; the body is a
     * minimal reconstruction, and the n-gram attribute names are
     * assumptions.)
     */
    private void smooth(List<Double> classCache) {
        // iterate over all instances
        for (int i = 0; i < dataset.numInstances(); i++) {
            final Instance inst = dataset.instance(i);
            // fill in the first-pass classifications of the block n-gram,
            // using MISSING_REPLACEMENT where a neighbor does not exist
            for (int j = 0; j < ngramSize; j++) {
                final Attribute attr = dataset.attribute(
                        ngramPrefixes[j] + "Classification");
                if (attr == null) {
                    continue;
                }
                final int neighborIdx = i + ngramStartIdx + j;
                if (neighborIdx < 0 || neighborIdx >= classCache.size()) {
                    inst.setValue(attr, MISSING_REPLACEMENT);
                } else {
                    inst.setValue(attr, classFeatureValues[(int)
                            classCache.get(neighborIdx).doubleValue()]);
                }
            }
            // re-classify; note that in this sketch later blocks see the
            // already smoothed classifications of their predecessors
            try {
                classCache.set(i, smoother.classifyInstance(inst));
            } catch (Exception e) {
                log.error("Smoothing failed for instance " + i, e);
            }
        }
    }

    /**
     * counts the number of words that are not inside <a> tags.
     * Preorder traversal is used.
     * @param isInA true iff e is an <a> tag itself or contained in one
     * @param wertiTextSpans all elements marked as wertiview spans
     * @param rtIt1 an iterator over the RelevantText chunks, for tokenization
     */
    private void docCounts(Element e, boolean isInA, Elements wertiTextSpans,
            Iterator<RelevantText> rtIt1) {
        // special treatment for <a> tags
        if ("a".equals(e.tagName())) {
            isInA = true;
        }
        // assign an id
        id.put(e, curId);
        curId++;
        int numTokens = 0;
        // outside of any <a> tag
        if (!isInA) {
            // do we have OpenNLP tokenization data for this element? i.e.
            // is this a relevant text piece?
            if (wertiTextSpans.contains(e) && rtIt1.hasNext()) {
                // get those tokens
                RelevantText rt = rtIt1.next();
                List<Token> tokenList = tokenize(rt);
                numTokens = tokenList.size();
            } else {
                // tokenize 'by hand'
                String[] tokens = tokenize(e);
                numTokens = tokens.length;
            }
            docNonAnchorWordCount += numTokens;
        }
        // cache the non-anchor word count
        nonAnchorWordCount.add(numTokens);
        // RECURSION (automatic BASE CASE: e has no children)
        for (Element child : e.children()) {
            docCounts(child, isInA, wertiTextSpans, rtIt1);
        }
    }

    /**
     * returns true iff e is not a WERTi or annotation span and is thus very
     * likely to have appeared in the original HTML document.
     */
    private static boolean wasInOriginalHtml(Element e) {
        // we can only recognize our inserted spans by their class names
        Set<String> classes = e.classNames();
        // if it is a werti span, return false
        return !classes.contains(className);
    }

    /**
     * returns true iff one of the parents is a tagName tag
     */
    private static boolean isIn(Elements parents, String tagName) {
        for (Element p : parents) {
            if (p.tagName().toLowerCase().equals(tagName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * returns true iff one of the parents is a h1...h6 tag (uses regex)
     */
    private static boolean isInHeadline(Elements parents) {
        for (Element p : parents) {
            if (p.tagName().toLowerCase().matches("h[1-6]")) {
                return true;
            }
        }
        return false;
    }

    /**
     * returns true iff one of the parents is a tag listed in tagList (see
     * constants above for useful tag lists)
     */
    private static boolean isInClass(Elements parents, String[] tagList) {
        for (Element p : parents) {
            for (String tagName : tagList) {
                if (p.tagName().toLowerCase().equals(tagName)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * returns the node that was the immediate parent of e in the original
     * HTML code. Skips all werti and annotation spans.
     */
    private static Element getImmediateParent(Elements parents) {
        return getImmediateParent(parents, PAT_ANYTHING);
    }

    /**
     * returns the closest parent whose tag name matches tagPattern. Skips
     * all werti and annotation spans.
     */
    private static Element getImmediateParent(Elements parents,
            Pattern tagPattern) {
        for (Element p : parents) {
            if (wasInOriginalHtml(p)
                    && tagPattern.matcher(p.tagName()).matches()) {
                return p;
            }
        }
        // if we couldn't find any non-werti non-annotation parent, return the
        // innermost parent
        return parents.get(0);
    }

    /**
     * returns true iff parent is a tagName tag
     */
    private static boolean isImmediateParent(Element parent, String tagName) {
        return parent.tagName().toLowerCase().equals(tagName);
    }
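    /*
     * divTdWordCount() further below calls getClosestDivTd(parents), but
     * that helper's definition did not survive in this source. A minimal
     * sketch, built on getImmediateParent(Elements, Pattern) above; the
     * fallback to emptyElement (the documented key for "outside any div/td")
     * is an assumption.
     */
    private Element getClosestDivTd(Elements parents) {
        for (Element p : parents) {
            if (wasInOriginalHtml(p)
                    && (PAT_TAGNAME_DIV.matcher(p.tagName()).matches()
                            || PAT_TAGNAME_TD.matcher(p.tagName()).matches())) {
                return p;
            }
        }
        // no enclosing div/td: the empty element keys the document-wide count
        return emptyElement;
    }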
    /**
     * return true iff this tag should be ignored.
     */
    private static boolean isIgnored(String tagName) {
        for (String toIgnore : TAGCLASS_IGNORE) {
            if (tagName.equals(toIgnore)) {
                return true;
            }
        }
        return false;
    }

    /**
     * return all intermediate and leaf nodes encountered on the way from prev
     * to e (or an empty list if prev is an emptyElement).
     */
    private Elements getElementsSincePrev(Element prev, Element e) {
        // if prev is empty, return an empty list
        if (prev.equals(emptyElement)) {
            return new Elements();
        } else {
            /* - traverse the whole tree from prev to e
             * - return a list of all nodes encountered on the way */
            // start at prev
            Element curNode = prev;
            // rval
            Elements inBetween = new Elements();
            boolean doIgnore = false;
            boolean breakAllLoops = false;
            // as soon as we reach e, we are finished
            while (!curNode.equals(e)) {
                doIgnore = false;
                // if this is an IGNORE node, ignore it
                if (isIgnored(curNode.tagName())) {
                    doIgnore = true;
                } else {
                    // if it is a werti or annotation span, don't add it
                    if (wasInOriginalHtml(curNode)) {
                        // add the node
                        inBetween.add(curNode);
                    }
                }
                // is there a level below the current?
                if (!doIgnore && curNode.children().size() != 0) {
                    // go deeper
                    curNode = curNode.children().first();
                }
                // if there is no deeper level:
                else {
                    // is there a next sibling?
                    if (curNode.nextElementSibling() != null) {
                        // go to the next sibling
                        curNode = curNode.nextElementSibling();
                    }
                    // if there is no next sibling
                    else {
                        // go up until there is a next sibling
                        while (curNode.nextElementSibling() == null) {
                            curNode = curNode.parent();
                            // when we reach the root node, stop
                            if (curNode == null) {
                                breakAllLoops = true;
                                break;
                            }
                            // also add the closing tags of these nodes while
                            // we're going up
                            if (!isIgnored(curNode.tagName())
                                    && wasInOriginalHtml(curNode)) {
                                inBetween.add(curNode);
                            }
                        }
                        if (breakAllLoops) {
                            break;
                        }
                        // go to that next sibling
                        curNode = curNode.nextElementSibling();
                    }
                }
            }
            // return the list
            return inBetween;
        }
    }

    /**
     * returns the number of html tags with tagName in splitElements.
     */
    private static int countSplits(Elements splitElements, String tagName) {
        tagName = tagName.toLowerCase();
        int count = 0;
        for (Element splitE : splitElements) {
            if (splitE.tagName().toLowerCase().equals(tagName)) {
                count += 1;
            }
        }
        return count;
    }

    /**
     * returns the number of block tags in splitElements.
     */
    private static int countSplitsBlock(Elements splitElements) {
        int count = 0;
        for (Element splitE : splitElements) {
            if (splitE.isBlock()) {
                count += 1;
            }
        }
        return count;
    }
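    /*
     * Illustrative example (ours, not from the original source): for the
     * fragment <div><p>A</p><h2>B</h2><p>C</p></div>, getElementsSincePrev()
     * called with the first <p> as prev and the second <p> as e returns the
     * starting <p> itself and the <h2>, so countSplits(.., "p") yields 1,
     * countSplitsBlock(..) yields 2, and the "split inline" count is 0.
     */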
    /**
     * returns, in this order, the absolute counts of letters, digits,
     * punctuation chars and whitespace chars, followed by the relative
     * counts of letters, digits, punctuation chars, whitespace chars and
     * other chars.
     * Also stores an only-letters version of the text in letters.
     */
    private static double[] charCounts(String text, String[] letters) {
        // total num of chars
        final int len = text.length();
        // throw out all non-letters and store the result in letters
        String onlyLetters = PAT_NON_ALPHAS.matcher(text).replaceAll("");
        letters[0] = onlyLetters;
        // delete all unwanted chars and get the length of the rest
        int alpha = onlyLetters.length();
        int digit = PAT_NON_DIGITS.matcher(text).replaceAll("").length();
        int punct = PAT_NON_PUNCTS.matcher(text).replaceAll("").length();
        int white = PAT_NON_WHITES.matcher(text).replaceAll("").length();
        int other = text.length() - alpha - digit - punct - white;
        // compute the relative counts from the absolute counts
        double alphaRel = (double) alpha / len;
        double digitRel = (double) digit / len;
        double punctRel = (double) punct / len;
        double whiteRel = (double) white / len;
        double otherRel = (double) other / len;
        // put all these values in an array and return it
        double[] rval = {alpha, digit, punct, white, alphaRel, digitRel,
                punctRel, whiteRel, otherRel};
        return rval;
    }

    /**
     * returns, in this order, the absolute and relative counts of all-letter
     * tokens, all-digit tokens and other tokens (tokens made up of letters
     * and digits are recognized but counted in neither class); the average
     * length of a token; the number and ratio of tokens starting with an
     * uppercase character; the number and ratio of tokens that are
     * all-uppercase; the number of sentences, the average sentence length
     * (in words), and the ratio of sentence boundary markers to the number
     * of tokens.
     * This method should work with both a naive whitespace tokenizer and a
     * tokenizer that treats punctuation as tokens.
     */
    private static double[] tokenAndSentenceCounts(String text,
            List<Token> tokenList, List<PlainTextSentenceAnnotation> sentList) {
        final int numTokens = tokenList.size();
        final int numSentences = sentList.size();
        // count the absolute num of tokens in each class
        int alpha = 0, digit = 0, other = 0;
        // sum of all token lengths (to calculate average token length later)
        int sumTokenLengths = 0;
        // number of tokens that start with an uppercase letter
        int numUpperTokens = 0;
        // number of all-uppercase tokens
        int numAllUpperTokens = 0;
        // count words (= tokens that are not sentence boundary markers)
        int numWords = 0;
        // traverse all tokens
        for (Token t : tokenList) {
            String token = t.getCoveredText();
            // the token consists only of letters
            if (PAT_ALPHAS.matcher(token).matches()) {
                alpha += 1;
            }
            // the token consists only of digits
            else if (PAT_DIGITS.matcher(token).matches()) {
                digit += 1;
            }
            // the token consists of letters and digits, but no other chars:
            // counted in neither class
            else if (PAT_ALPHADIGITS.matcher(token).matches()) {
            }
            // the token contains chars other than letters and digits
            else {
                other += 1;
            }
            sumTokenLengths += token.length();
            // all-uppercase tokens and ones starting with an uppercase char
            if (PAT_ALL_UPPER.matcher(token).matches()) {
                numAllUpperTokens += 1;
            } else if (PAT_UPPER.matcher(token).lookingAt()) {
                numUpperTokens += 1;
            }
            // words (not tokens!)
            if (!PAT_SENTENCE_END_STRICT.matcher(token).matches()) {
                numWords += 1;
            }
        }
        // compute the relative numbers of tokens
        double alphaRel = (double) alpha / numTokens;
        double digitRel = (double) digit / numTokens;
        double otherRel = (double) other / numTokens;
        // average token length
        double avgTokenLength = (double) sumTokenLengths / numTokens;
        // number and ratio of uppercase tokens
        double ratioUpperTokens = (double) numUpperTokens / numTokens;
        double ratioAllUpperTokens = (double) numAllUpperTokens / numTokens;
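        // Note (added): with an empty tokenList or sentList the divisions in
        // this method yield NaN or Infinity; Weka treats NaN attribute
        // values as missing, Infinity it does not.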
        // compute the average sentence length in words (not tokens!)
        double avgNumWords = (double) numWords / numSentences;
        // ratio of sentence boundary markers to number of tokens
        int sentBound = PAT_NON_SENTENCE_END.matcher(text).replaceAll("")
                .length();
        double sentBoundRel = (double) sentBound / numTokens;
        // put together an array of all the values and return it
        double[] rval = {alpha, digit, other, alphaRel, digitRel, otherRel,
                avgTokenLength, numUpperTokens, ratioUpperTokens,
                numAllUpperTokens, ratioAllUpperTokens, numSentences,
                avgNumWords, sentBoundRel};
        return rval;
    }

    /**
     * returns true iff text ends with a sentence end character.
     */
    private static boolean isSentenceEnd(String text) {
        return text.endsWith(".") || text.endsWith("?") || text.endsWith("!");
    }

    /**
     * returns two parallel boolean arrays: the first indicates for each
     * given pattern whether it matched the whole text, the second whether
     * it was contained somewhere in text.
     */
    private static boolean[][] matchedAndContained(String text,
            Pattern[] patterns) {
        boolean[][] rval = new boolean[2][patterns.length];
        // (loop body lost to markup stripping; reconstructed: row 0 holds
        // full matches, row 1 containment)
        for (int i = 0; i < patterns.length; i++) {
            final Matcher m = patterns[i].matcher(text);
            rval[0][i] = m.matches();
            m.reset();
            rval[1][i] = m.find();
        }
        return rval;
    }
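    /*
     * Illustrative example (ours): with the PAT_BULLET patterns above,
     * matchedAndContained("- first item", PAT_BULLET)[1] is {true, false}:
     * the text starts with dash punctuation (\p{Pd}) but contains no letter
     * or digit followed by a period. extractFeatures() uses exactly these
     * 'contained' flags for the "pattern bullet * contained" features.
     */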
    /**
     * returns the number of tokens in e and all its descendants, excluding
     * tokens that are in <a> tags
     */
    private int numNonAnchorWords(Element e) {
        // the empty element doesn't have an ID, but it has 0 tokens anyway
        if (e.equals(emptyElement)) {
            return 0;
        }
        int eId = id.get(e);
        int rval = nonAnchorWordCount.get(eId);
        // RECURSION (automatic BASE CASE: e has no children)
        for (Element child : e.children()) {
            rval += numNonAnchorWords(child);
        }
        return rval;
    }

    /**
     * returns the number of tokens in the first <div> or <td> found in
     * parents. Excludes tokens that are in <a> tags.
     */
    private double divTdWordCount(Elements parents) {
        // find the closest enclosing div or td in the parents list
        Element closestDivTd = getClosestDivTd(parents);
        // if we have seen this div/td before, get the word count
        if (divTdWordCounts.containsKey(closestDivTd)) {
            return divTdWordCounts.get(closestDivTd);
        }
        // if we haven't seen it before
        else {
            // count the words in the div/td
            double newWordCount = numNonAnchorWords(closestDivTd);
            // add it to the storage together with its word count
            divTdWordCounts.put(closestDivTd, newWordCount);
            // return the new word count
            return newWordCount;
        }
    }

    /**
     * text density as described in Kohlschütter and Fankhauser 2010
     */
    private static double textDensity(List<Token> tokenList) {
        int numWords = 0;
        int numWrappedLines = 0;
        int currentLineLength = -1; // don't count the first space
        final int maxLineLength = 80;
        int numTokens = 0;
        int numWordsCurrentLine = 0;
        for (Token t : tokenList) {
            numTokens++;
            numWords++;
            numWordsCurrentLine++;
            final int tokenLength = t.getCoveredText().length();
            currentLineLength += tokenLength + 1;
            if (currentLineLength > maxLineLength) {
                // WRAP
                numWrappedLines++;
                currentLineLength = tokenLength;
                numWordsCurrentLine = 1;
            }
        }
        if (numTokens == 0) {
            return 0;
        }
        int numWordsInWrappedLines;
        if (numWrappedLines == 0) {
            return numWords;
        } else {
            numWordsInWrappedLines = numWords - numWordsCurrentLine;
            return (double) numWordsInWrappedLines / numWrappedLines;
        }
    }
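    /*
     * Worked example (ours) for textDensity() above: 30 tokens of 7 chars
     * each fit 10 to a line (10 * 8 - 1 = 79 chars <= 80), so tokens 1-10
     * and 11-20 fill two wrapped lines while 21-30 stay on the last,
     * partial line; the density is therefore (30 - 10) / 2 = 10 words per
     * wrapped line. Blocks shorter than one line return their raw word
     * count, so long flowing text and short fragments such as headlines or
     * navigation items get clearly different values.
     */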

    /**
     * does this block share a parent <p>, <div>, or <td> with its neighbor
     * blocks?
     * @return array of booleans, ordered by neighbor block
     */
    private boolean[] sharesParentWithNeighbor(Element prePrev, Element prev,
            Element e, Element next, Element postNext) {
        // to return
        boolean[] sharesArray = new boolean[numSharesFeats];
        int i = 0;
        // which parent tag names to check for
        Pattern[] parentTagNames = {PAT_TAGNAME_P, PAT_TAGNAME_DIV,
                PAT_TAGNAME_TD};
        // this block's parent <p>, <div>, and <td>
        Element[] myParents = new Element[parentTagNames.length];
        for (int j = 0; j < parentTagNames.length; j++) {
            myParents[j] = getImmediateParent(e.parents(), parentTagNames[j]);
        }
        // compare with each neighbor block, in document order (this loop
        // was lost to markup stripping; reconstructed -- the guards against
        // missing neighbors and against getImmediateParent's
        // innermost-parent fallback are ours)
        final Element[] neighbors = {prePrev, prev, next, postNext};
        for (Element neighbor : neighbors) {
            for (int j = 0; j < parentTagNames.length; j++) {
                if (!neighbor.equals(emptyElement)) {
                    final Element theirParent = getImmediateParent(
                            neighbor.parents(), parentTagNames[j]);
                    sharesArray[i] = parentTagNames[j].matcher(
                            myParents[j].tagName()).matches()
                            && myParents[j].equals(theirParent);
                }
                i++;
            }
        }
        return sharesArray;
    }

    /**
     * finds the closest parent that is not an <a> tag, then calculates
     * number of tokens inside <a> tags / number of tokens
     */
    private double linkDensity(Elements parents, RelevantText rt) {
        // find closest parent that is not an <a> tag
        Element parent = getImmediateParent(parents, PAT_NON_ANCHOR_TAG);
        // count the number of tokens inside <a> tags and the total # of tokens
        int numInsideA = 0;
        int numTotal = 0;
        // breadth-first traversal
        LinkedList<Element> q = new LinkedList<Element>();
        q.add(parent);
        while (!q.isEmpty()) {
            Element cur = q.remove();
            // <a> tags
            if (cur.tagName().equals("a")) {
                // add text and text of children to both counts
                int curNum = tokenize(cur.text()).length;
                numInsideA += curNum;
                numTotal += curNum;
            }
            // other tags
            else {
                // add only own text
                numTotal += tokenize(cur).length;
                // enqueue children
                q.addAll(cur.children());
            }
        }
        // return number of tokens inside <a> tags / number of tokens
        return (numTotal == 0) ? 0 : ((double) numInsideA / numTotal);
    }

    /**
     * Simple tokenizer copied from boilerpipe:
     *
     * Tokenizes the text and returns an array of tokens.
     *
     * Try to use the other two tokenize methods if possible.
     *
     * @param text
     *            The text
     * @return The tokens
     */
    private static String[] tokenize(final String text) {
        /* The empty string would be tokenized to an array containing the
         * empty string. This array would then have length 1, although there
         * are 0 tokens in the text. This is why we need this special check.
         */
        if (text.trim().length() == 0) {
            String[] emptyArray = new String[0];
            return emptyArray;
        }
        return PAT_NOT_WORD_BOUNDARY
                .matcher(PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063"))
                .replaceAll("$1").replaceAll("[ \u2063]+", " ").trim()
                .split("[ ]+");
    }

    /**
     * Tokenizes e's OWN text and returns an array of tokens. Works with a
     * cache. Does not tokenize anything itself, only calls tokenize(String).
     *
     * @param e
     *            The HTML Element
     * @return The tokens
     */
    private String[] tokenize(final Element e) {
        // if e has no id, only tokenize its text
        if (id.get(e) == null) {
            return tokenize(e.ownText());
        }
        int eId = id.get(e);
        // look up the value in the cache
        if (eId < tokenCache.size() && tokenCache.get(eId) != null) {
            return tokenCache.get(eId);
        } else {
            String[] tokens = tokenize(e.ownText());
            // resize the cache
            for (int i = tokenCache.size(); i <= eId; i++) {
                tokenCache.add(null);
            }
            // add the newly calculated value to the cache
            tokenCache.set(eId, tokens);
            return tokens;
        }
    }

    /**
     * retrieves the tokenization data from OpenNLP
     */
    @SuppressWarnings("unchecked")
    private List<Token> tokenize(RelevantText rt) {
        // find the number of tokens in this span, using the
        // information from the existing tokenization
        final Iterator<Token> tokenIt = tokenIndex.subiterator(rt);
        List<Token> tokenList = new ArrayList<Token>();
        while (tokenIt.hasNext()) {
            Token t = tokenIt.next();
            tokenList.add(t);
        }
        return tokenList;
    }

    /**
     * retrieves the sentence segmentation data from OpenNLP
     */
    @SuppressWarnings("unchecked")
    private List<PlainTextSentenceAnnotation> sentDetect(RelevantText rt) {
        // find all sentences that begin inside this relevant text piece
        final Iterator<PlainTextSentenceAnnotation> sentIt =
                sentIndex.subiterator(rt, true, false);
        List<PlainTextSentenceAnnotation> sentList =
                new ArrayList<PlainTextSentenceAnnotation>();
        while (sentIt.hasNext()) {
            PlainTextSentenceAnnotation s = sentIt.next();
            sentList.add(s);
        }
        return sentList;
    }

    /**
     * converts true => 1 and false => 0.
     */
    public static int toInt(boolean bool) {
        if (bool) {
            return 1;
        } else {
            return 0;
        }
    }
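    /*
     * process() calls getFirstDataset(), whose definition did not survive in
     * this source. A minimal sketch, assuming the Remove filter (see the
     * imports) hides the 'shares*' attributes from the first classifier;
     * the exact attribute range is an assumption (Remove uses 1-based
     * indices).
     */
    private Instances getFirstDataset() throws AnalysisEngineProcessException {
        try {
            final Remove remove = new Remove();
            // drop the attributes that only the smoother uses
            remove.setAttributeIndices(
                    (sharesAttrsStart + 1) + "-" + sharesAttrsEnd);
            remove.setInputFormat(dataset);
            return Filter.useFilter(dataset, remove);
        } catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
    }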
    /**
     * uses both == and .equals(_) for equivalence testing (OR-ed).
     * @return true iff item is in array.
     */
    public static boolean in(Object item, Object[] array) {
        // (cannot be replaced with Arrays.binarySearch()
        // since we need comparison with the equals() method)
        for (Object o : array) {
            if (o == item || o.equals(item)) {
                return true;
            }
        }
        return false;
    }
}