package werti.uima.ae;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;

import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

import werti.uima.types.annot.SentenceAnnotation;
import werti.uima.types.annot.PlainTextSentenceAnnotation;
import werti.uima.types.annot.RelevantText;
import werti.uima.types.annot.Token;

/**
 * HTML content type annotator:
 *
 * Classifies relevant text spans into headline, body, boilerplate, etc.
 *
 * @author Adriane Boyd
 */
public class HTMLContentTypeAnnotator extends JCasAnnotator_ImplBase {

    public static String className = "wertiview";

    private static Element emptyElement;

    // margin for double comparison
    public static final double EPSILON = Math.pow(10, -14);

    // whether to perform smoothing
    private static final boolean DO_SMOOTHING = true;

    // CACHES
    // assigns a unique id to each element
    private static HashMap<Element, Integer> id = new HashMap<Element, Integer>();
    private static int curId = 0;

    // list of ids of all WERTi spans (will be initialized as soon as we know
    // how many WERTi spans we have)
    private static int[] wertiIds = null;
    private static int wertiIdsIdx = 0;

    // feature cache; access: features[featureName][wertiId]
    private static HashMap<String, double[]> features = new HashMap<String, double[]>();

    // the token annotation for this document
    private static AnnotationIndex tokenIndex = null;
    private static AnnotationIndex sentIndex = null;

    // caches for tokenized and sent-detected text (keys are ids)
    private static ArrayList<String[]> tokenCache = new ArrayList<String[]>();
    private static ArrayList<List<PlainTextSentenceAnnotation>> sentenceCache =
            new ArrayList<List<PlainTextSentenceAnnotation>>();

    // cache for the div/td word counts; the empty element is the key for the
    // sum of all words outside any div/td
    private static HashMap<Element, Double> divTdWordCounts =
            new HashMap<Element, Double>();

    // number of words in the document that are not in an <a> tag
    private static int docNonAnchorWordCount = 0;
    // the same for every block
    private static ArrayList<Integer> nonAnchorWordCount = new ArrayList<Integer>();
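    // Illustrative example of the feature cache layout above (values are
    // invented): features.get("char alphaRel")[3] == 0.9 would mean that 90%
    // of the characters of the werti span at index 3 are letters.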
density","link density","num words this", "sharesPWithPrePrev","sharesDivWithPrePrev","sharesTdWithPrePrev", "sharesPWithPrev","sharesDivWithPrev","sharesTdWithPrev", "sharesPWithNext","sharesDivWithNext","sharesTdWithNext", "sharesPWithPostNext","sharesDivWithPostNext","sharesTdWithPostNext"}; // The following attribute indices are all zero-based. // attribute index of 'sent numSentences' private static final int numSentAttrIdx = 30; // attribute index of the 'id' attribute private static final int idAttrIdx = 37; // index of "num words this" in this list private static final int numWordsThisAttrIdx = 42; // number of 'shares*' features private static final int numSharesFeats = 12; // index in the feature name list where the 'shares*' features start private static final int sharesFeatsStartIdx = featureNames.length-numSharesFeats; // attribute indices of the 'shares*' attributes private static final int sharesAttrsStart = 42; // (end is exclusive) private static final int sharesAttrsEnd = sharesAttrsStart+numSharesFeats; // attribute index of the 'Classification' attribute (filled by the first classifier) private static final int classAttrIdx = 61; // attribute index of the 'annotation' attribute (filled by the 2nd classifier) private static final int annotationAttrIdx = 64; // names of the features that involve more than one span private static final String[] additionalFeatureNames = {"p/c num words", "p/c text density","|p-c| text density","num words prev", "num words next"}; private static final int numWordsPrevAttrIdx = featureNames.length + 3; private static final int numWordsNextAttrIdx = featureNames.length + 4; // name of the class attribute private static final String classFeatureName = "annotation"; // possible values for the class attribute private static final String[] classFeatureValues = { "Not content_Related content", "Headline", "Supplemental", "Full text_Comments"}; // the CSS classes must not contain spaces or special chars private static final String boilerplate = "boilerplate"; private static final String headline = "headline"; private static final String supplemental = "supplemental"; private static final String content = "content"; private static final String[] cssClasses = {boilerplate, headline, supplemental, content}; private static TreeMap classToCss = null; // useful lists of tags that can be seen as one 'class' of tags private static final String[] TAGCLASS_LIST = {"li", "ul", "ol"}; private static final String[] TAGCLASS_IGNORE = {"script", "noscript", "form", "object", "embed", "head", "meta", "link", "title", "applet", "style", "a"}; // PATTERNS // pattern that matches anything private static final Pattern PAT_ANYTHING = Pattern.compile(".*"); // tag name patterns private static final Pattern PAT_TAGNAME_DIV = Pattern.compile("[Dd][Ii][Vv]"); private static final Pattern PAT_TAGNAME_TD = Pattern.compile("[Tt][Dd]"); private static final Pattern PAT_TAGNAME_P = Pattern.compile("[Pp]"); // from boilerpipe, for tokenizer private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b"); private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern .compile("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*"); // matches any tag name except for private static final Pattern PAT_NON_ANCHOR_TAG = Pattern .compile(".*[^a]+.*"); // was needed for feature extraction of training data //private static final String PAT_ANNOTATION = "x-nc-sel[0-5]"; // a token that ends a sentence (matches "." 
and "word.") private static final Pattern PAT_SENTENCE_END_STRICT = Pattern .compile("[.?!]+"); private static final Pattern PAT_NON_SENTENCE_END = Pattern .compile(".*[^.?!]+"); // character class regexps private static final Pattern PAT_UPPER = Pattern.compile("\\p{Lu}"); private static final Pattern PAT_ALL_UPPER = Pattern.compile("\\p{Lu}+"); private static final Pattern PAT_ALPHAS = Pattern.compile("(\\p{L})+"); private static final Pattern PAT_DIGITS = Pattern.compile("(\\p{N})+"); private static final Pattern PAT_ALPHADIGITS = Pattern .compile("[\\p{L}\\p{N}]+"); private static final Pattern PAT_NON_ALPHAS = Pattern.compile("[^\\p{L}]+"); private static final Pattern PAT_NON_DIGITS = Pattern.compile("[^\\p{N}]+"); private static final Pattern PAT_NON_PUNCTS = Pattern.compile("[^\\p{P}]+"); private static final Pattern PAT_NON_WHITES = Pattern.compile("[^\\p{Z}]+"); // regexps for the bullet feature private static final Pattern[] PAT_BULLET = { Pattern.compile("\\p{Pd}.*"), Pattern.compile("[\\p{N}\\p{L}]\\..*")}; // for smoothing // J48 cannot handle missing values, so use a replacement instead private static final String MISSING_REPLACEMENT = "missing"; // size of the block n-gram to consider private static final int ngramSize = 5; // index at which the n-gram should start (relative to the current block) private static final int ngramStartIdx = - (int) (ngramSize/2); // what to call the n-gram features: name prefixes private static final String[] ngramPrefixes = {"prePre", "pre", "", "post", "postPost"}; private static final Logger log = LogManager.GetLogger(HTMLContentTypeAnnotator.class); // the first classifier private static Classifier cls; // the second classifier (for smoothing) private static Classifier smoother; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); // load weka model (use hard-coded path in /usr/local/werti from // desc/annotator/HtmlContentTypeAnnotator.xml, can switch to // WERTiContext later) // some API examples are here: https://svn.scms.waikato.ac.nz/svn/weka/branches/stable-3-6/wekaexamples/src/main/java/wekaexamples/classifiers/WekaDemo.java log.info("Loading model"); String modelFileName = (String)aContext.getConfigParameterValue("htmlContentModelLocation"); try { cls = (Classifier) SerializationHelper.read(modelFileName); } catch (Exception e) { log.info("could not load model {}", modelFileName); return; } String smoothingModelFileName = (String)aContext.getConfigParameterValue("htmlContentSmootherModelLocation"); try { smoother = (Classifier) SerializationHelper.read(smoothingModelFileName); } catch (Exception e) { log.info("could not load smoother model {}", smoothingModelFileName); StackTraceElement[] stackTrace = e.getStackTrace(); for (StackTraceElement element : stackTrace){ log.info(element.toString()); } return; } /*try { } catch (WERTiContextException wce) { throw new ResourceInitializationException(wce); }*/ } /** * Classifies relevant text spans into headline, body, boilerplate, etc. * * @param cas The document's cas. 
    /**
     * Classifies relevant text spans into headline, body, boilerplate, etc.
     *
     * @param cas The document's CAS.
     */
    @SuppressWarnings("unchecked")
    public void process(JCas cas) throws AnalysisEngineProcessException {
        classToCss = new TreeMap<String, String>();
        // map the internal class names to the CSS class names
        for (int i = 0; i < classFeatureValues.length; i++) {
            classToCss.put(classFeatureValues[i], cssClasses[i]);
        }

        // reset the per-document caches
        id = new HashMap<Element, Integer>();
        curId = 0;
        wertiIdsIdx = 0;
        features = new HashMap<String, double[]>();
        divTdWordCounts = new HashMap<Element, Double>();
        docNonAnchorWordCount = 0;
        nonAnchorWordCount = new ArrayList<Integer>();

        // get HTML from CAS
        String htmlString = cas.getDocumentText();
        // parse with Jsoup
        Document doc = Jsoup.parse(htmlString);
        // find all wertiview spans
        Elements wertiTextSpans = doc.getElementsByClass(className);

        // prepare to iterate over relevant text pieces
        final AnnotationIndex rtIndex = cas.getAnnotationIndex(RelevantText.type);
        tokenIndex = cas.getAnnotationIndex(Token.type);
        sentIndex = cas.getAnnotationIndex(PlainTextSentenceAnnotation.type);

        // debugging output for sentence beginnings (commented out completely:
        // with only the body disabled, the loop would never terminate)
        //log.info("sentence beginnings:");
        //Iterator sentIt = sentIndex.iterator();
        //while (sentIt.hasNext()){
        //    log.info(sentIt.next().getBegin());
        //}

        // iterator over relevant text pieces for feature extraction
        final Iterator<RelevantText> rtIt = rtIndex.iterator();
        // iterator for document counts
        final Iterator<RelevantText> rtIt1 = rtIndex.iterator();
        // iterator for classification
        final Iterator<RelevantText> rtIt2 = rtIndex.iterator();

        // count number of words outside <a> tags and set IDs
        docCounts(doc.body(), false, wertiTextSpans, rtIt1);

        // initialize wertiIds
        int numWertiSpans = wertiTextSpans.size();
        wertiIds = new int[numWertiSpans];

        // create an empty span to use as padding for the beginning/end
        // surrounding context
        emptyElement = new Element(Tag.valueOf("span"), doc.baseUri());

        /* GENERAL IDEA:
         * - iterate over all WERTi spans (with context) and compute all
         *   features, cache the feature values; add each span's id to a list
         *   of ids of WERTi spans
         * - iterate over this id list and print the features; probably have
         *   different print methods for Kohlschütter's features and 'our'
         *   features
         */

        // prepare the feature cache
        for (int i = 0; i < featureNames.length; i++) {
            features.put(featureNames[i], new double[numWertiSpans]);
        }
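        // Note (an assumption implied by the parallel iterators above, not
        // guaranteed by Jsoup or UIMA individually): the wertiview spans in
        // the Jsoup parse and the RelevantText annotations in the CAS are
        // visited in the same document order, so the i-th element of one
        // corresponds to the i-th element of the other.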
        /*
        ListIterator<Element> spanIt = wertiTextSpans.listIterator();
        Element prevElement = emptyElement;
        Element currentElement = spanIt.hasNext() ? spanIt.next() : emptyElement;
        Element nextElement = spanIt.hasNext() ? spanIt.next() : emptyElement;

        // iterate over RelevantText spans and extract features; store them in
        // caches
        while (rtIt.hasNext()) {
            RelevantText rt = rtIt.next();

            // extract the features for this span
            if (currentElement != emptyElement)
                extractFeatures(prevElement, currentElement, rt);
            else
                log.debug("encountered empty element in JSoup parse");

            // find the next wertiview span in the JSoup parse
            if (spanIt.hasNext()) {
                prevElement = currentElement;
                currentElement = nextElement;
                nextElement = spanIt.next();
            }
            else {
                prevElement = currentElement;
                currentElement = nextElement;
                nextElement = emptyElement;
            }
        }
        */

        // extract features and cache the values
        for (int index = 0; index < numWertiSpans; index++) {
            // ... [gap: walks the werti spans with two spans of context on
            // either side and calls extractFeatures for each of them] ...
        }

        // ... [gap: builds the Weka dataset 'dataset' from the cached feature
        // values] ...

        ArrayList classificationCache = new ArrayList(dataset.numInstances());

        // filter to apply before the first classifier
        Instances firstDataset = getFirstDataset(dataset);

        // classify the instances using the first classifier and add the values
        // for the 'shares*' features
        for (int i = 0; i < firstDataset.numInstances(); i++) {
            // ... [gap: classifies instance i with cls and caches the result
            // in classificationCache] ...
        }

        // ... [gap: smoothing with the second classifier and writing the
        // predicted classes back to the CAS via rtIt2] ...
    }

    /**
     * extracts the features for one werti span / RelevantText piece and
     * stores them in the feature cache.
     */
    private static void extractFeatures(Element prePrev, Element prev,
            Element e, Element next, Element postNext, RelevantText rt) {
        Elements parents = e.parents();
        Element immediateParent = getImmediateParent(parents);
        List<Token> tokenList = tokenize(rt);
        List<PlainTextSentenceAnnotation> sentList = sentDetect(rt);
        //log.debug("Text starting at index: {}", rt.getBegin());
        //log.debug("Text ending at index: {}", rt.getEnd());
        //log.debug("TEXT::{}::TEXT", rt.getCoveredText());
        //log.debug("Number of tokens: {}", tokenList.size());
        //log.debug("Number of sentences: {}", sentList.size());

        int myId = id.get(e);
        wertiIds[wertiIdsIdx] = myId;

        // for debugging:
        /*System.out.print("\n" + e.html() + "¶");
        System.out.println("<" + immediateParent.tagName() + ">");*/

        // container features
        features.get("p")[wertiIdsIdx] = (double) toInt(isIn(parents, "p"));
        features.get("p immediate")[wertiIdsIdx] = (double) toInt(
                isImmediateParent(immediateParent, "p"));
        features.get("a")[wertiIdsIdx] = (double) toInt(isIn(parents, "a"));
        features.get("a immediate")[wertiIdsIdx] = (double) toInt(
                isImmediateParent(immediateParent, "a"));
        features.get("h1-6")[wertiIdsIdx] = (double) toInt(
                isInHeadline(parents));
        features.get("list")[wertiIdsIdx] = (double) toInt(
                isInClass(parents, TAGCLASS_LIST));

        // split features
        final Elements splitElements = getElementsSincePrev(prev, e);
        features.get("split p")[wertiIdsIdx] =
                (double) countSplits(splitElements, "p");
        features.get("split br")[wertiIdsIdx] =
                (double) countSplits(splitElements, "br");
        final int numBlock = countSplitsBlock(splitElements);
        features.get("split inline")[wertiIdsIdx] =
                (double) (splitElements.size() - numBlock);
        features.get("split block")[wertiIdsIdx] = (double) numBlock;

        // the letters from the current text
        String[] letters = new String[1];

        // char features
        double[] charCounts = charCounts(e.ownText(), letters);
        String[] charFeatNames = {"char alpha", "char digit", "char punct",
            "char white", "char alphaRel", "char digitRel", "char punctRel",
            "char whiteRel", "char otherRel"};
        for (int i = 0; i < charFeatNames.length; i++) {
            features.get(charFeatNames[i])[wertiIdsIdx] = charCounts[i];
        }

        // ... [gap: the token, sentence, sentence-end, bullet, div/td word
        // ratio, id, text density, link density and word-count features
        // listed in featureNames are computed and cached here] ...
        // does this block share a parent <p>, <div>, or <td> with one of its
        // neighbor blocks? (used only for smoothing)
        boolean[] sharesArray = sharesParentWithNeighbor(prePrev, prev, e,
                next, postNext);
        for (int i = 0; i < numSharesFeats; i++) {
            features.get(featureNames[sharesFeatsStartIdx + i])[wertiIdsIdx] =
                    (double) toInt(sharesArray[i]);
        }

        wertiIdsIdx++;
    }

    // ... [gap: a helper method that takes the dataset and an ArrayList
    // classCache, iterates over all instances and merges the first
    // classifier's decisions into the dataset for smoothing] ...

    /**
     * counts the number of words in the document that are outside of <a> tags.
     * Preorder traversal is used.
     * @param isInA true iff e is an <a> tag itself or contained in one
     * @param wertiTextSpans all elements marked as wertiview spans
     * @param rtIt1 an iterator over the RelevantText chunks, for tokenization
     */
    private static void docCounts(Element e, boolean isInA,
            Elements wertiTextSpans, Iterator<RelevantText> rtIt1) {
        // special treatment for <a> tags
        if (e.tagName().equals("a")) {
            isInA = true;
        }

        // assign an id
        id.put(e, curId);
        curId++;

        int numTokens = 0;
        // outside of any <a> tag
        if ( ! isInA) {
            // do we have OpenNLP tokenization data for this element? i.e. is
            // this a relevant text piece?
            if (wertiTextSpans.contains(e) && rtIt1.hasNext()) {
                // get those tokens
                RelevantText rt = rtIt1.next();
                List<Token> tokenList = tokenize(rt);
                numTokens = tokenList.size();
            }
            else {
                // tokenize 'by hand'
                String[] tokens = tokenize(e);
                numTokens = tokens.length;
            }
            docNonAnchorWordCount += numTokens;
        }
        // cache the non-anchor word count
        nonAnchorWordCount.add(numTokens);

        // RECURSION (automatic BASE CASE: e has no children)
        for (Element child : e.children()) {
            docCounts(child, isInA, wertiTextSpans, rtIt1);
        }
    }

    /**
     * returns true iff e is not a WERTi or annotation span and is thus very
     * likely to have appeared in the original HTML document.
     */
    private static boolean wasInOriginalHtml(Element e) {
        // we can only recognize our inserted spans by their class names
        Set<String> classes = e.classNames();
        // if it is a werti span, return false
        return ! classes.contains(className);

        // was needed for feature extraction on training data
        /*if (classes.contains(className)){
            return false;
        }
        // if the tag doesn't belong to the werti class
        else{
            // does the tag belong to any annotation class?
            boolean contains = false;
            for (String c : classes)
                contains = contains || c.matches(PAT_ANNOTATION);
            return ! contains;
        }*/
    }

    /**
     * returns true iff one of the parents is a tagName tag
     */
    private static boolean isIn(Elements parents, String tagName) {
        for (Element p : parents) {
            if (p.tagName().toLowerCase().equals(tagName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * returns true iff one of the parents is a h1...h6 tag (uses regex)
     */
    private static boolean isInHeadline(Elements parents) {
        for (Element p : parents) {
            if (p.tagName().toLowerCase().matches("h[1-6]")) {
                return true;
            }
        }
        return false;
    }

    /**
     * returns true iff one of the parents is a tag listed in tagList (see
     * constants above for useful tag lists)
     */
    private static boolean isInClass(Elements parents, String[] tagList) {
        for (Element p : parents) {
            for (String tagName : tagList) {
                if (p.tagName().toLowerCase().equals(tagName)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * returns the node that was the immediate parent of e in the original
     * HTML code. Skips all werti and annotation spans.
     */
    private static Element getImmediateParent(Elements parents) {
        return getImmediateParent(parents, PAT_ANYTHING);
    }
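    // Illustrative: for original markup <p><span class="wertiview">x</span></p>,
    // getImmediateParent returns the <p> element, skipping any additional
    // werti or annotation spans wrapped around the text.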
    /**
     * returns the closest parent that matches the pattern tagPattern. Skips
     * all werti and annotation spans.
     */
    private static Element getImmediateParent(Elements parents,
            Pattern tagPattern) {
        for (Element p : parents) {
            if (wasInOriginalHtml(p)
                    && tagPattern.matcher(p.tagName()).matches())
                return p;
        }
        // if we couldn't find any non-werti non-annotation parent, return the
        // innermost parent
        return parents.get(0);
    }

    /**
     * returns true iff parent is a tagName tag
     */
    private static boolean isImmediateParent(Element parent, String tagName) {
        return parent.tagName().toLowerCase().equals(tagName);
    }

    /**
     * return true iff this tag should be ignored.
     */
    private static boolean isIgnored(String tagName) {
        for (String toIgnore : TAGCLASS_IGNORE) {
            if (tagName.equals(toIgnore))
                return true;
        }
        return false;
    }

    /**
     * return all intermediate and leaf nodes encountered on the way from prev
     * to e (or an empty list if prev is an emptyElement).
     */
    private static Elements getElementsSincePrev(Element prev, Element e) {
        // if prev is empty, return an empty list
        if (prev.equals(emptyElement)) {
            return new Elements();
        }
        else {
            /* - traverse the whole tree from prev to e
             * - return a list of all nodes encountered on the way
             */
            // start at prev
            Element curNode = prev;
            // rval
            Elements inBetween = new Elements();
            boolean doIgnore = false;

            // as soon as we reach e, we are finished
            while ( ! curNode.equals(e)) {
                doIgnore = false;
                // if this is an IGNORE node, ignore it
                if (isIgnored(curNode.tagName())) {
                    doIgnore = true;
                    log.info("{} is ignored.", curNode.tagName());
                }
                else {
                    // if it is a werti or annotation span, don't add it
                    if (wasInOriginalHtml(curNode)) {
                        // add the node
                        inBetween.add(curNode);
                    }
                }

                // is there a level below the current?
                if ( ! doIgnore && curNode.children().size() != 0) {
                    // go deeper
                    curNode = curNode.children().first();
                }
                // if there is no deeper level:
                else {
                    // is there a next sibling?
                    if (curNode.nextElementSibling() != null) {
                        // go to the next sibling
                        curNode = curNode.nextElementSibling();
                    }
                    // if there is no next sibling
                    else {
                        // go up until there is a next sibling
                        while (curNode.nextElementSibling() == null) {
                            curNode = curNode.parent();
                            // also add the closing tags of these nodes while
                            // we're going up
                            if ( ! isIgnored(curNode.tagName())
                                    && wasInOriginalHtml(curNode)) {
                                inBetween.add(curNode);
                            }
                        }
                        // go to that next sibling
                        curNode = curNode.nextElementSibling();
                    }
                }
            }
            // return the list
            return inBetween;
        }
    }

    /**
     * returns the number of html tags with tagName in splitElements.
     */
    private static int countSplits(Elements splitElements, String tagName) {
        tagName = tagName.toLowerCase();
        int count = 0;
        for (Element splitE : splitElements) {
            if (splitE.tagName().toLowerCase().equals(tagName)) {
                count += 1;
            }
        }
        return count;
    }

    /**
     * returns the number of block tags in splitElements.
     */
    private static int countSplitsBlock(Elements splitElements) {
        int count = 0;
        for (Element splitE : splitElements) {
            if (splitE.isBlock()) {
                count += 1;
            }
        }
        return count;
    }
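    // Illustrative: if the previous werti span sits in one <p> and the
    // current one in the following <p>, getElementsSincePrev collects both
    // <p> elements on its way, so countSplits(..., "p") == 2 and
    // countSplitsBlock(...) == 2, while the number of inline splits is 0.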
    /**
     * returns, in this order, the absolute counts of letters, digits,
     * punctuation chars and whitespace chars, followed by the relative counts
     * of letters, digits, punctuation chars, whitespace chars and other chars.
     * Also stores an only-letters version of the text in letters.
     */
    private static double[] charCounts(String text, String[] letters) {
        // total num of chars
        final int len = text.length();

        // throw out all non-letters and store the result in letters
        String onlyLetters = PAT_NON_ALPHAS.matcher(text).replaceAll("");
        letters[0] = onlyLetters;

        // delete all unwanted chars and get the length of the rest
        int alpha = onlyLetters.length();
        int digit = PAT_NON_DIGITS.matcher(text).replaceAll("").length();
        int punct = PAT_NON_PUNCTS.matcher(text).replaceAll("").length();
        int white = PAT_NON_WHITES.matcher(text).replaceAll("").length();
        int other = text.length() - alpha - digit - punct - white;

        // compute the relative counts from the absolute counts
        double alphaRel = (double) alpha / len;
        double digitRel = (double) digit / len;
        double punctRel = (double) punct / len;
        double whiteRel = (double) white / len;
        double otherRel = (double) other / len;

        // put all these values in an array and return it
        double[] rval = {alpha, digit, punct, white, alphaRel, digitRel,
            punctRel, whiteRel, otherRel};
        return rval;
    }
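    // Worked example for charCounts (illustrative): for text = "ab1 ?" the
    // result is {2, 1, 1, 1, 0.4, 0.2, 0.2, 0.2, 0.0} and letters[0] == "ab"
    // (2 letters, 1 digit, 1 punctuation char, 1 space, no other chars).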
    /**
     * returns, in this order, the absolute and relative counts of all-letter
     * tokens, all-digit tokens and other tokens (tokens that mix letters and
     * digits but contain nothing else are counted in none of these classes);
     * the average length of a token and the number and ratio of tokens
     * starting with an uppercase character; the number and ratio of tokens
     * that are all-uppercase; the number of sentences, the average sentence
     * length (in tokens), and the ratio of sentence boundary markers to the
     * number of tokens.
     * This method should work with both a naive whitespace tokenizer and a
     * tokenizer that treats punctuation as tokens.
     */
    private static double[] tokenAndSentenceCounts(String text,
            List<Token> tokenList, List<PlainTextSentenceAnnotation> sentList) {
        final int numTokens = tokenList.size();
        final int numSentences = sentList.size();

        // count the absolute num of tokens in each class
        int alpha = 0, digit = 0, other = 0;
        // sum of all token lengths (to calculate average token length later)
        int sumTokenLengths = 0;
        // number of tokens that start with an uppercase letter
        int numUpperTokens = 0;
        // number of all-uppercase tokens
        int numAllUpperTokens = 0;
        // count words (= tokens that are not a sentence boundary marker)
        int numWords = 0;

        // traverse all tokens
        for (Token t : tokenList) {
            String token = t.getCoveredText();
            // the token consists only of letters
            if (PAT_ALPHAS.matcher(token).matches()) {
                alpha += 1;
            }
            // the token consists only of digits
            else if (PAT_DIGITS.matcher(token).matches()) {
                digit += 1;
            }
            // the token consists of letters and digits, but no other chars
            // (counted in none of the three classes)
            else if (PAT_ALPHADIGITS.matcher(token).matches()) {
            }
            // the token contains chars other than letters and digits
            else {
                other += 1;
            }
            sumTokenLengths += token.length();

            // all-uppercase tokens and ones starting with an uppercase char
            if (PAT_ALL_UPPER.matcher(token).matches())
                numAllUpperTokens += 1;
            else if (PAT_UPPER.matcher(token).lookingAt())
                numUpperTokens += 1;

            // words (not tokens!)
            if ( ! PAT_SENTENCE_END_STRICT.matcher(token).matches())
                numWords += 1;
        }

        // compute the relative numbers of tokens
        double alphaRel = (double) alpha / numTokens;
        double digitRel = (double) digit / numTokens;
        double otherRel = (double) other / numTokens;
        // average token length
        double avgTokenLength = (double) sumTokenLengths / numTokens;
        // number and ratio of uppercase tokens
        double ratioUpperTokens = (double) numUpperTokens / numTokens;
        double ratioAllUpperTokens = (double) numAllUpperTokens / numTokens;

        // compute the average sentence length in words (not tokens!)
        double avgNumWords = (double) numWords / numSentences;
        // ratio of sentence boundary markers to number of tokens
        int sentBound = PAT_NON_SENTENCE_END.matcher(text).replaceAll("")
                .length();
        double sentBoundRel = (double) sentBound / numTokens;

        // put together an array of all the values and return it
        double[] rval = {alpha, digit, other, alphaRel, digitRel, otherRel,
            avgTokenLength, numUpperTokens, ratioUpperTokens,
            numAllUpperTokens, ratioAllUpperTokens, numSentences, avgNumWords,
            sentBoundRel};
        return rval;
    }

    /**
     * returns true iff text ends with a sentence end character.
     */
    private static boolean isSentenceEnd(String text) {
        return text.endsWith(".") || text.endsWith("?") || text.endsWith("!");
    }

    /**
     * returns an array of booleans each indicating whether one of the given
     * patterns matched and another array of booleans indicating if the
     * patterns were contained in text.
     */
    private static boolean[][] matchedAndContained(String text,
            Pattern[] patterns) {
        boolean[][] rval = new boolean[2][patterns.length];
        for (int i = 0; i < patterns.length; i++) {
            rval[0][i] = patterns[i].matcher(text).matches();
            rval[1][i] = patterns[i].matcher(text).find();
        }
        return rval;
    }

    /**
     * returns the closest enclosing div or td from the parents list, or the
     * empty element if the block is outside any div/td.
     */
    private static Element getClosestDivTd(Elements parents) {
        for (Element p : parents) {
            if (PAT_TAGNAME_DIV.matcher(p.tagName()).matches()
                    || PAT_TAGNAME_TD.matcher(p.tagName()).matches()) {
                return p;
            }
        }
        return emptyElement;
    }

    /**
     * returns the number of non-anchor words in e and all of its descendants,
     * i.e. words that are not in <a> tags.
     */
    private static int numNonAnchorWords(Element e) {
        /*// skip <a> tags
        // unnecessary
        if (e.tagName().equals("a"))
            return 0;*/

        // the empty element doesn't have an ID, but it has 0 tokens anyway
        if (e.equals(emptyElement)) {
            return 0;
        }

        int eId = id.get(e);
        int rval = nonAnchorWordCount.get(eId);

        // RECURSION (automatic BASE CASE: e has no children)
        for (Element child : e.children()) {
            rval += numNonAnchorWords(child);
        }
        return rval;
    }
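    // Illustrative: for the text "1. Introduction", PAT_BULLET[1]
    // ("[\\p{N}\\p{L}]\\..*") is both matched and contained, while
    // PAT_BULLET[0] ("\\p{Pd}.*") is neither; for "- item" the two are
    // reversed.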
    /**
     * returns the number of tokens in the first <div> or <td> found in
     * parents. Excludes tokens that are in <a> tags.
     */
    private static double divTdWordCount(Elements parents) {
        // find the closest enclosing div or td in the parents list
        Element closestDivTd = getClosestDivTd(parents);

        // if we have seen this div/td before, get the word count
        if (divTdWordCounts.containsKey(closestDivTd)) {
            return divTdWordCounts.get(closestDivTd);
        }
        // if we haven't seen it before
        else {
            // count the words in the div/td
            double newWordCount = numNonAnchorWords(closestDivTd);
            // add it to the storage together with its word count
            divTdWordCounts.put(closestDivTd, newWordCount);
            // return the new word count
            return newWordCount;
        }
    }

    /**
     * text density as described in Kohlschütter/Fankhauser 2010
     */
    private static double textDensity(List<Token> tokenList) {
        int numWords = 0;
        int numWrappedLines = 0;
        int currentLineLength = -1; // don't count the first space
        final int maxLineLength = 80;
        int numTokens = 0;
        int numWordsCurrentLine = 0;

        for (Token t : tokenList) {
            numTokens++;
            numWords++;
            numWordsCurrentLine++;

            final int tokenLength = t.getCoveredText().length();
            currentLineLength += tokenLength + 1;
            if (currentLineLength > maxLineLength) {
                // WRAP
                numWrappedLines++;
                currentLineLength = tokenLength;
                numWordsCurrentLine = 1;
                //System.out.print("\n" + token);
            }
            //else
            //System.out.print(" " + token);
        }

        if (numTokens == 0) {
            return 0;
        }

        int numWordsInWrappedLines;
        if (numWrappedLines == 0) {
            return numWords;
        }
        else {
            numWordsInWrappedLines = numWords - numWordsCurrentLine;
            return (double) numWordsInWrappedLines / numWrappedLines;
        }
    }
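    // Worked example for textDensity (illustrative): 25 tokens of length 5
    // fill the first 80-char line after 13 tokens, so there is 1 wrapped line
    // containing 13 words and the density is 13 / 1 = 13.0; a short block
    // that never wraps simply returns its word count.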
    /**
     * does this block share a parent <p>, <div>, or <td> with its neighbor
     * blocks?
     * @return array of booleans, ordered by neighbor block
     */
    private static boolean[] sharesParentWithNeighbor(Element prePrev,
            Element prev, Element e, Element next, Element postNext) {
        // to return
        boolean[] sharesArray = new boolean[numSharesFeats];
        int i = 0;

        // which parent tag names to check for
        Pattern[] parentTagNames = {PAT_TAGNAME_P, PAT_TAGNAME_DIV,
            PAT_TAGNAME_TD};

        // this block's parent <p>, <div>, and <td>
        Element[] myParents = new Element[parentTagNames.length];
        for (int j = 0; j < parentTagNames.length; j++) {
            myParents[j] = getImmediateParent(e.parents(), parentTagNames[j]);
        }

        // ... [gap: compares myParents with the corresponding parents of
        // prePrev, prev, next and postNext and fills sharesArray, one boolean
        // per neighbor/tag-name combination] ...

        return sharesArray;
    }

    /**
     * link density: finds closest parent that is not an <a> tag, then
     * calculates
     * number of tokens inside <a> tags / number of tokens
     */
    private static double linkDensity(Elements parents, RelevantText rt) {
        // find closest parent that is not an <a> tag
        Element parent = getImmediateParent(parents, PAT_NON_ANCHOR_TAG);

        // count the number of tokens inside <a> tags and the total # of tokens
        int numInsideA = 0;
        int numTotal = 0;

        // breadth-first traversal
        LinkedList<Element> q = new LinkedList<Element>();
        q.add(parent);
        while ( ! q.isEmpty()) {
            Element cur = q.remove();
            // <a> tags
            if (cur.tagName().equals("a")) {
                // add text and text of children to both counts
                int curNum = tokenize(cur.text()).length;
                numInsideA += curNum;
                numTotal += curNum;
            }
            // other tags
            else {
                // add only own text
                numTotal += tokenize(cur).length;
                // enqueue children
                q.addAll(cur.children());
            }
        }
        //System.out.print(" parent:<" + parent.tagName() + "> numInsideA:"
        //        + numInsideA + " numTotal:" + numTotal + " ");

        // return number of tokens inside <a> tags / number of tokens
        return (numTotal == 0) ? 0 : ((double) numInsideA / numTotal);
    }

    /**
     * Simple tokenizer copied from boilerpipe:
     *
     * Tokenizes the text and returns an array of tokens.
     *
     * Try to use the other two tokenize methods if possible.
     *
     * @param text
     *            The text
     * @return The tokens
     */
    private static String[] tokenize(final String text) {
        /* The empty string would be tokenized to an array containing the empty
         * string. This array would then have length 1, although there are 0
         * tokens in the text. This is why we need this special check. */
        if (text.trim().length() == 0) {
            String[] emptyArray = new String[0];
            return emptyArray;
        }
        return PAT_NOT_WORD_BOUNDARY
                .matcher(PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063"))
                .replaceAll("$1").replaceAll("[ \u2063]+", " ").trim()
                .split("[ ]+");
        //return TOKENIZER.tokenize(text);
    }

    /**
     * Tokenizes e's OWN text and returns an array of tokens. Works with a
     * cache. Does not tokenize anything itself, only calls tokenize(String).
     * @param e
     *            The HTML Element
     * @return The tokens
     */
    private static String[] tokenize(final Element e) {
        // if e has no id, only tokenize its text
        if (id.get(e) == null)
            return tokenize(e.ownText());

        int eId = id.get(e);
        // look up the value in the cache
        if (eId < tokenCache.size() && tokenCache.get(eId) != null) {
            return tokenCache.get(eId);
        }
        else {
            String[] tokens = tokenize(e.ownText());
            // resize the cache
            // this takes long, probably better use a HashMap?
            for (int i = tokenCache.size(); i <= eId; i++)
                tokenCache.add(null);
            // add the newly calculated value to the cache
            tokenCache.set(eId, tokens);
            return tokens;
        }
    }

    /**
     * retrieves the tokenization data from OpenNLP
     */
    @SuppressWarnings("unchecked")
    private static List<Token> tokenize(RelevantText rt) {
        // find the number of tokens in this span, using the
        // information from the existing tokenization
        final Iterator<Token> tokenIt = tokenIndex.subiterator(rt);
        List<Token> tokenList = new ArrayList<Token>();
        while (tokenIt.hasNext()) {
            Token t = tokenIt.next();
            tokenList.add(t);
        }
        return tokenList;
    }

    /**
     * retrieves the sentence segmentation data from OpenNLP
     */
    @SuppressWarnings("unchecked")
    private static List<PlainTextSentenceAnnotation> sentDetect(RelevantText rt) {
        // find all sentences that begin inside this relevant text piece
        final Iterator<PlainTextSentenceAnnotation> sentIt =
                sentIndex.subiterator(rt, true, false);
        List<PlainTextSentenceAnnotation> sentList =
                new ArrayList<PlainTextSentenceAnnotation>();
        while (sentIt.hasNext()) {
            PlainTextSentenceAnnotation s = sentIt.next();
            sentList.add(s);
            //log.debug("sentBegin: {}", s.getBegin());
        }
        return sentList;
    }
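    // Illustrative example for the boilerpipe-style tokenize(String) above:
    // tokenize("Hello, world!") yields {"Hello,", "world!"}, i.e. punctuation
    // stays attached to its word, and tokenize("") yields an empty array.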
    /**
     * converts true => 1 and false => 0.
     */
    public static int toInt(boolean bool) {
        if (bool)
            return 1;
        else
            return 0;
    }

    /**
     * uses both == and .equals(_) for equivalence testing (OR-ed).
     * @return true iff item is in array.
     */
    public static boolean in(Object item, Object[] array) {
        // (cannot be made efficient or replaced with Arrays.binarySearch()
        // since we need comparison with the equals() method)
        for (Object o : array) {
            if (o == item || o.equals(item))
                return true;
        }
        return false;
    }
}