Commit 97e88225 authored by JR

Building the trie now requires the user to enter the Tokeniser as a configuration

(_gateTokeniserPR) in the gazetteers file.
parent f047837c
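For context, the _gateTokeniserPR value is expected to name a GATE processing resource that the plugin loads with Factory.createResource, mirroring what TrieUtil.buildTrie does in this commit. Below is a minimal, hedged sketch using the GATE Embedded API; it is not part of this commit, it assumes GATE and the ANNIE plugin are on the classpath, and the resource class name and sample text are illustrative only.

    import gate.Document;
    import gate.Factory;
    import gate.Gate;
    import gate.LanguageAnalyser;

    public class TokeniserSketch {
        public static void main(String[] args) throws Exception {
            Gate.init(); // initialise GATE before creating any resource
            // Illustrative value; in the plugin it is read from the _gateTokeniserPR header entry.
            String tokeniserPR = "gate.creole.tokeniser.DefaultTokeniser";
            LanguageAnalyser tokeniser = (LanguageAnalyser) Factory.createResource(tokeniserPR);
            Document doc = Factory.newDocument("New York; Rio de Janeiro; Curitiba");
            tokeniser.setDocument(doc);
            tokeniser.setCorpus(null); // the PR runs outside a corpus pipeline here
            tokeniser.execute();       // produces Token/SpaceToken annotations on the document
            System.out.println(doc.getAnnotations().size() + " annotations created");
            Factory.deleteResource(tokeniser);
            Factory.deleteResource(doc);
        }
    }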
......@@ -12,3 +12,4 @@ build.properties
/bin/
hs_err_*.log
/teste_manifest_img
......@@ -7,4 +7,4 @@ This plugin provides a PR for the GATE framework to perform approximate string a
Edit Distance search and string similarity metrics.
For more information visit <a href="https://gitlab.c3sl.ufpr.br/faes/asm/wikis/home">WIKI</a>.
Reports and doubts can be send to: jferri@inf.ufpr.br
\ No newline at end of file
For reports and questions, email: jferri@inf.ufpr.br
\ No newline at end of file
......@@ -23,6 +23,7 @@ package br.ufpr.inf.junior.imgazetteer;
import java.awt.event.ActionEvent;
import java.io.BufferedReader;
import java.io.File;
import java.io.PrintWriter;
import java.lang.management.ManagementFactory;
import java.lang.reflect.Method;
import java.util.ArrayList;
......@@ -226,6 +227,19 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
private String editDistanceFeatureName;
@RunTime
@Optional
@CreoleParameter(comment = "Tokeniser that will be used on entries from gazetteers file.", defaultValue = "")
public void setTokeniserGatePR(String tgPR) {
tokeniserGatePR = tgPR;
}
public String getTokeniserGatePR() {
return tokeniserGatePR;
}
private String tokeniserGatePR;
@RunTime
@Optional
@CreoleParameter(comment = "Path of class that contains the method to calculate string similarity.", defaultValue = "")
......@@ -391,10 +405,14 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
*/
protected void loadDataFromDictionary() throws Exception {
if (getConfigFileURL() != null) {
List<String> gazetteerEntriesList = new ArrayList<>();
logger.info("Creating gazetteer from " + getConfigFileURL());
logger.info("Creating trie from " + getConfigFileURL());
BufferedReader dictionaryBuffReader = new BomStrippingInputStreamReader((configFileURL).openStream(),
encoding);
// Report the status change.
fireStatusChanged("Building trie ...");
// Load configuration header.
HashMap<String, String> listConfigFromDictionary = new HashMap<>();
......@@ -429,6 +447,11 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
else
throw new ExecutionException("Your gazetteer miss a configuration: _editDistanceFeatureName");
if (listConfigFromDictionary.containsKey("_gateTokeniserPR"))
setTokeniserGatePR(listConfigFromDictionary.get("_gateTokeniserPR"));
else
throw new ExecutionException("Your gazetteer miss a configuration: _gateTokeniserPR");
if (listConfigFromDictionary.containsKey("_similarityClass"))
setSimilarityClass(listConfigFromDictionary.get("_similarityClass"));
else
......@@ -480,19 +503,30 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
this.gazetteerFeaturesMap = makeTrie.loadEntriesAndFeatures(configFileURL, this.entrieDelimiter,
this.featureNameValueSeparator, this.featuresSeparator, encoding);
// Maps line number -> length of the normalized entry written on that line.
HashMap<Integer, Integer> tamanhoLinhaMap = new HashMap<>();
Integer linhaCount = 0;
File configFile = gate.util.Files.fileFromURL(configFileURL);
// Write the normalized entries to a sibling *_GAZETTEERS.txt file so the
// tokeniser PR can be run over them in TrieUtil.buildTrie.
String entriesFileName = configFile.getAbsolutePath().replaceAll("\\.txt$", "\\_GAZETTEERS.txt");
PrintWriter writer = new PrintWriter(entriesFileName, encoding);
for (Map.Entry<String, List<GazetteerFeature>> element : gazetteerFeaturesMap.entrySet()) {
String s = new String();
linhaCount++;
String entrieNormalized = element.getKey().trim();
entrieNormalized = ws_pattern.matcher(entrieNormalized).replaceAll(" ");
gazetteerEntriesList.add(entrieNormalized);
tamanhoLinhaMap.put(linhaCount, entrieNormalized.length());
writer.println(entrieNormalized);
}
writer.close();
// Build trie
this.trie = new NodeFT(null, Boolean.FALSE, new ArrayList<>(), 0);
this.trie = new TrieUtil().buildTrie(gazetteerEntriesList, trie, this.useTransformation, encoding,
this.transformationClass, this.transformationMethod);
this.transformationClass, this.transformationMethod, entriesFileName, tamanhoLinhaMap,
this.tokeniserGatePR);
this.deepestLevel = trie.getLevel();
this.trie.setLevel(0);
fireProcessFinished();
fireStatusChanged("TRIE is complete.");
logger.info("Trie depth:" + deepestLevel);
logger.info("TRIE is complete.");
} else {
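The line lengths recorded in tamanhoLinhaMap above are turned back into character offsets inside TrieUtil.buildTrie, so that the token annotations produced over the *_GAZETTEERS.txt file can be regrouped into one entry per line. A minimal sketch of that arithmetic follows (assuming, as the committed code does with its "+ 2", a two-character line break between entries); the entry strings are illustrative.

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class LineOffsetSketch {
        public static void main(String[] args) {
            // line number -> length of the normalized entry written on that line
            Map<Integer, Integer> tamanhoLinhaMap = new LinkedHashMap<>();
            tamanhoLinhaMap.put(1, "new york".length()); // 8
            tamanhoLinhaMap.put(2, "curitiba".length()); // 8
            int lineMaxOffSet = 0;
            for (Map.Entry<Integer, Integer> e : tamanhoLinhaMap.entrySet()) {
                lineMaxOffSet += e.getValue();
                // Tokens whose end offset is <= lineMaxOffSet belong to this entry.
                System.out.println("line " + e.getKey() + ": tokens end at offset <= " + lineMaxOffSet);
                lineMaxOffSet += 2; // account for the line break before the next entry
            }
        }
    }

With the two sample entries this prints "line 1: tokens end at offset <= 8" and "line 2: tokens end at offset <= 18".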
......@@ -577,8 +611,8 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
// Annotate the document.
ArrayList<SimilarityResult> similarityResults = new ArrayList<>();
for (EditDistanceResult edr : editDistanceResultList) {
similarityResults = auxiliarStringSimilarity.secondMatch(edr, m, obj,
this.minAcceptedSimilarity, this.numberBetterSimilarities, caseSensitive);
similarityResults = auxiliarStringSimilarity.secondMatch(edr, m, obj, this.minAcceptedSimilarity,
this.numberBetterSimilarities, caseSensitive);
if (similarityResults != null && !similarityResults.isEmpty()) {
createLookups(edr, similarityResults);
similarityResults.clear();
......@@ -648,12 +682,14 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
}
private List<Action> actions;
@Override
public List<Action> getActions() {
if (actions == null) {
actions = new ArrayList<Action>();
actions.add(new AbstractAction("Initialize") {
private static final long serialVersionUID = 1L;
@Override
public void actionPerformed(ActionEvent evt) {
File configFile = gate.util.Files.fileFromURL(getConfigFileURL());
......
......@@ -21,6 +21,7 @@
package br.ufpr.inf.junior.imgazetteer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import br.ufpr.inf.junior.utils.StringUtil;
......@@ -35,6 +36,7 @@ public class Search {
private Integer maxEd;
private final NodeFT trie;
// Per-character log of how many trie nodes stayed active ("<char>:<count>").
List<String> activeNodesForCharList = new ArrayList<>();
public Search(NodeFT trie, Integer maxEd) {
this.trie = trie;
......@@ -60,7 +62,7 @@ public class Search {
List<EditDistanceResult> editDistanceResultList = new ArrayList<>();
// Create an array, which size is the maximum level of trie.
// Each position of this array store a list, which will store
// Each position of this array stores a list, which will hold the
// active nodes of the same level.
List<ArrayList<NodeFT>> activeNodesArray = new ArrayList<>();
for (int i = 0; i <= deepestLevel; i++) {
......@@ -82,7 +84,7 @@ public class Search {
List<Character> convertedTokenArray = new ArrayList<>();
while (i < tokenList.size()) {
Annotation tokenAnnotation = tokenList.get(i);
// Space token annotations are considered single blank space.
// Space token annotations are treated as a single blank space.
if (tokenAnnotation.getType().equals(ANNIEConstants.SPACE_TOKEN_ANNOTATION_TYPE)) {
convertedToken += " ";
chunkRead.setEndOffset(tokenAnnotation.getEndNode().getOffset().intValue());
......@@ -122,12 +124,19 @@ public class Search {
if (caseSensitive) {
for (char c : convertedTokenArray) {
activeNodesArray = runSearchOnTrieCaseSensitive(c, activeNodesArray, deepestLevel);
this.activeNodesForCharList.add(c + ":" + countActiveNodes(activeNodesArray));
}
} else {
for (char c : convertedTokenArray) {
activeNodesArray = runSearchOnTrieNoCase(c, activeNodesArray, deepestLevel);
// Keep searching only while at least one node is still active.
if (countActiveNodes(activeNodesArray) > 0) {
activeNodesArray = runSearchOnTrieNoCase(c, activeNodesArray, deepestLevel);
this.activeNodesForCharList.add(c + ":" + countActiveNodes(activeNodesArray));
} else {
break;
}
}
}
// Check which nodes are end of entries.
activLeavesList = checkIsLeaf(activeNodesArray);
......@@ -135,9 +144,18 @@ public class Search {
editDistanceResultList.add(new EditDistanceResult(activLeavesList, new InputString(chunkRead)));
}
// If none active node has child nodes, do not add another token to this chunk.
// Read chunk is cleared and the search start a new chunk using next token.
// If no active node has child nodes, do not add another token
// to this chunk.
// The chunk read so far is cleared and the search starts a new
// chunk from the next token.
if (!thereIsMoreChildForNodeArray(activeNodesArray) && tempIndex < tokenList.size()) {
// Log the per-character active-node counts recorded for this chunk.
System.out.println("Chunk: " + chunkRead.getStringBuffer().toString());
for (String anfcl : activeNodesForCharList) {
System.out.println(anfcl);
}
i = tempIndex;
chunkRead.setStringBuffer(new StringBuffer());
chunkRead.setStartOffset(tokenList.get(i).getStartNode().getOffset().intValue());
......@@ -145,7 +163,7 @@ public class Search {
chunkRead.setNextCharacter(null);
nextStartDefined = false;
this.trie.setEDValue(0);// Set root's EDValue to zero.
this.trie.setEDValue(0);// Set root's EDValue to zero.
// Active nodes list is emptied.
for (ArrayList<NodeFT> nodeList : activeNodesArray) {
for (NodeFT n : nodeList) {
......@@ -156,6 +174,7 @@ public class Search {
}
(activeNodesArray.get(this.trie.getLevel())).add(this.trie);
this.trie.setValidNode(true);
this.activeNodesForCharList.clear();
} else {
i++;// Moves to next token.
}
......@@ -177,8 +196,8 @@ public class Search {
}
/**
* Based on active nodes, each character is searched in trie.
* The search start from deepper/higher level nodes.
* Based on active nodes, each character is searched in the trie. The search
* starts from the deeper/higher level nodes.
*
* This is the case sensitive search.
*
......@@ -244,8 +263,10 @@ public class Search {
break;
}
for (NodeFT nodeB : listTemp1) {
/* An active node always kept the lowest value for
* ED that it is allowed.
/*
 * An active node always keeps the
 * lowest ED value it is allowed
 * to have.
*/
if (nodeB.isValidNode()) {
if (nodeB.getEDValue() > intAux) {
......@@ -263,7 +284,8 @@ public class Search {
listTemp2.clear();
}
}
} else {// Child node's character is different of searched character.
} else {// Child node's character is different from the
// searched character.
if (valueEDAux < this.maxEd) {
if (childNode.isValidNode()) {
if (childNode.getEDValue() > valueEDAux + 1) {
......@@ -285,8 +307,8 @@ public class Search {
}
/*
* For nodes that already was active before the search of last character,
* the ED value is increased by 1.
* For nodes that were already active before the search of the
* last character, the ED value is increased by 1.
*/
if (rootNode.getEDValue() + 1 <= this.maxEd) {
rootNode.setEDValue(rootNode.getEDValue() + 1);
......@@ -302,8 +324,8 @@ public class Search {
}
/**
* Based on active nodes, each character is searched in trie.
* The search start from deepper/higher level nodes.
* Based on active nodes, each character is searched in the trie. The search
* starts from the deeper/higher level nodes.
*
* This is the case-insensitive search.
*
......@@ -369,8 +391,10 @@ public class Search {
break;
}
for (NodeFT nodeB : listTemp1) {
/* An active node always kept the lowest value for
* ED that it is allowed.
/*
 * An active node always keeps the
 * lowest ED value it is allowed
 * to have.
*/
if (nodeB.isValidNode()) {
if (nodeB.getEDValue() > intAux) {
......@@ -388,7 +412,8 @@ public class Search {
listTemp2.clear();
}
}
} else {// Child node's character is different of searched character.
} else {// Child node's character is different from the
// searched character.
if (valueEDAux < this.maxEd) {
if (childNode.isValidNode()) {
if (childNode.getEDValue() > valueEDAux + 1) {
......@@ -410,8 +435,8 @@ public class Search {
}
/*
* For nodes that already was active before the search of last character,
* the ED value is increased by 1.
* For nodes that were already active before the search of the
* last character, the ED value is increased by 1.
*/
if (rootNode.getEDValue() + 1 <= this.maxEd) {
rootNode.setEDValue(rootNode.getEDValue() + 1);
......@@ -464,4 +489,18 @@ public class Search {
return false;
}
/**
* Return the total number of active nodes across all levels.
*
* @param list active nodes grouped by trie level
* @return the number of active nodes
*/
private int countActiveNodes(List<ArrayList<NodeFT>> list) {
int numberActiveNodes = 0;
for (List<NodeFT> al : list) {
numberActiveNodes += al.size();
}
return numberActiveNodes;
}
}// END class
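The new countActiveNodes helper sums the per-level buckets that Search keeps in activeNodesArray (one list of active NodeFT objects per trie level). A self-contained sketch of that layout follows; plain strings stand in for NodeFT here, and the levels and contents are illustrative only.

    import java.util.ArrayList;
    import java.util.List;

    public class ActiveNodeCountSketch {
        static int countActiveNodes(List<ArrayList<String>> perLevel) {
            int total = 0;
            for (List<String> level : perLevel) {
                total += level.size(); // every node stored in a bucket counts as active
            }
            return total;
        }

        public static void main(String[] args) {
            // One bucket per trie level, index 0 being the root's level.
            List<ArrayList<String>> activeNodesArray = new ArrayList<>();
            for (int level = 0; level <= 2; level++) {
                activeNodesArray.add(new ArrayList<String>());
            }
            activeNodesArray.get(0).add("root");
            activeNodesArray.get(1).add("n");
            System.out.println(countActiveNodes(activeNodesArray)); // prints 2
        }
    }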
......@@ -20,11 +20,13 @@
package br.ufpr.inf.junior.utils;
//import static br.com.ftdistance.vo.Main.GAZETTEERS_URL;
import java.io.BufferedReader;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
......@@ -32,7 +34,15 @@ import br.ufpr.inf.junior.imgazetteer.Entry;
import br.ufpr.inf.junior.imgazetteer.GazetteerFeature;
import br.ufpr.inf.junior.imgazetteer.InputString;
import br.ufpr.inf.junior.imgazetteer.NodeFT;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.DocumentContent;
import gate.Factory;
import gate.LanguageAnalyser;
import gate.creole.ANNIEConstants;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;
/**
......@@ -63,70 +73,120 @@ public class TrieUtil {
* @throws Exception
*/
public NodeFT buildTrie(List<String> gazetteerList, NodeFT root, boolean useTransformation, String encoding,
String transformationClass, String transformationMethod) throws Exception {
String transformationClass, String transformationMethod, String entriesFilePath,
HashMap<Integer, Integer> tamanhoLinhaMap, String tokeniserPR) throws Exception {
int deepestLevel = 0;
int numberChars = 0;
if (gazetteerList != null && gazetteerList.size() > 0) {
java.net.URL entriesFileURL = new File(entriesFilePath).toURI().toURL();
Document doc = Factory.newDocument(entriesFileURL);
// Create the tokeniser PR named in the gazetteer configuration.
LanguageAnalyser defaultTokeniserPR = (LanguageAnalyser) Factory
.createResource(tokeniserPR);
defaultTokeniserPR.setDocument(doc);
defaultTokeniserPR.setCorpus(null); // set the corpus to null
defaultTokeniserPR.execute(); // run the tokeniser over the entries file
AnnotationSet tokenAnnotations = doc.getAnnotations();
Factory.deleteResource(defaultTokeniserPR);
Factory.deleteResource(doc);
List<Annotation> tokenAnnotationsList = new ArrayList<Annotation>();
tokenAnnotationsList.addAll(tokenAnnotations);
Collections.sort(tokenAnnotationsList, OFFSET_COMPARATOR);
int lineCount = 0;
int currentIndex = 0;
Integer lineMaxOffSet = 0;
while (currentIndex < tokenAnnotationsList.size()) {
List<String> entrieTokens = new ArrayList<>();
lineCount++;
if (tamanhoLinhaMap.containsKey(lineCount)) {
lineMaxOffSet += (tamanhoLinhaMap.get(lineCount));
} else {
break;
}
// Collect the tokens whose end offset still falls inside the current line.
for (int x = currentIndex; x < tokenAnnotationsList.size(); x++) {
Annotation ann = tokenAnnotationsList.get(x);
if (ann.getEndNode().getOffset().intValue() > lineMaxOffSet) {
break;
} else {
entrieTokens.add((String) ann.getFeatures().get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME));
currentIndex = x;
}
}
StringBuffer sb = new StringBuffer();
for (String tempToken : entrieTokens) {
sb.append(tempToken);
}
NodeFT currentlyNodeFT;
int currentPosition = 0;
int length = 0;
int levelInTrie = 0;
Integer nodeId = 0;
String convertedString = new String();
List<InputString> tokenList = new ArrayList();
List<String> convertedTokenList = new ArrayList();
String convertedToken = new String();
for (String s : gazetteerList) {
if (!useTransformation) {
convertedString = s;
} else {// Transformation algorithm will be
// used.
/*
* Entries are divided in tokens using an algorithm
* developed in this project. Do not use annie gazetteer (for now).
*/
convertedTokenList.clear();
tokenList.clear();
// Get tokens from an entrie.
tokenList = StringUtil.string2TokenList(s);
// Tokens are transformed.
for (InputString tempToken : tokenList) {
numberChars += tempToken.getStringBuffer().length();
// ONLY apply transformation on tokens that start with a
// letter.
if (Character.isLetter(tempToken.getStringBuffer().charAt(0))) {
convertedToken = StringUtil.stringTransformation(tempToken.getStringBuffer().toString(),
transformationClass, transformationMethod);
} else {
convertedToken = tempToken.getStringBuffer().toString();
}
convertedTokenList.add(convertedToken);
}
if (!useTransformation) {
convertedString = sb.toString();
} else {// Apply transformation algorithm.
convertedTokenList.clear();
for (String tempToken : entrieTokens) {
// Create a single string using transformed tokens from an
// entrie.
StringBuffer stringBufferTemp = new StringBuffer();
for (String stringTemp : convertedTokenList) {
stringBufferTemp.append(stringTemp);
numberChars += tempToken.length();
// ONLY apply transformation on tokens that start
// with a letter.
if (Character.isLetter(tempToken.charAt(0))) {
convertedToken = StringUtil.stringTransformation(tempToken.toString(), transformationClass,
transformationMethod);
} else {
convertedToken = tempToken;
}
convertedString = stringBufferTemp.toString();
convertedTokenList.add(convertedToken);
}
// Create a single string using the transformed tokens from
// an entry.
StringBuffer convertedStringBuffer = new StringBuffer();
for (String stringTemp : convertedTokenList) {
convertedStringBuffer.append(stringTemp);
}
convertedString = convertedStringBuffer.toString();
}
currentlyNodeFT = root;
boolean foundChild = false;
currentPosition = 0;
length = convertedString.length();
// Use each entry character to build the trie.
while (currentPosition < length) {
Character c = convertedString.charAt(currentPosition);
levelInTrie = currentlyNodeFT.getLevel() + 1;
foundChild = false;
// If the current node doesn't have any children.
if (currentlyNodeFT.getChilds() == null || currentlyNodeFT.getChilds().isEmpty()) {
currentlyNodeFT = root;
boolean foundChild = false;
currentPosition = 0;
length = convertedString.length();
// Use each entry character to build the trie.
while (currentPosition < length) {
Character c = convertedString.charAt(currentPosition);
levelInTrie = currentlyNodeFT.getLevel() + 1;
foundChild = false;
// If the current node doesn't have any children.
if (currentlyNodeFT.getChilds() == null || currentlyNodeFT.getChilds().isEmpty()) {
NodeFT child = createNewChildNode(c, levelInTrie, nodeId);
currentlyNodeFT.getChilds().add(child);
currentlyNodeFT = child;
// Get the deepest level of trie.
if (deepestLevel < levelInTrie) {
deepestLevel = levelInTrie;
}
} else {
for (NodeFT nFT : currentlyNodeFT.getChilds()) {
if (nFT.getSymbol().equals(c)) {
currentlyNodeFT = nFT;
foundChild = true;
break;// Stop looping on nodes set.
}
}
// If there is no child node for that character.
if (!foundChild) {
NodeFT child = createNewChildNode(c, levelInTrie, nodeId);
currentlyNodeFT.getChilds().add(child);
currentlyNodeFT = child;
......@@ -134,33 +194,17 @@ public class TrieUtil {
if (deepestLevel < levelInTrie) {
deepestLevel = levelInTrie;
}
} else {
for (NodeFT nFT : currentlyNodeFT.getChilds()) {
if (nFT.getSymbol().equals(c)) {
currentlyNodeFT = nFT;
foundChild = true;
break;// Stop looping on nodes set.
}
}
// If there is no child node for that character.
if (!foundChild) {
NodeFT child = createNewChildNode(c, levelInTrie, nodeId);
currentlyNodeFT.getChilds().add(child);
currentlyNodeFT = child;
// Get the deepest level of trie.
if (deepestLevel < levelInTrie) {
deepestLevel = levelInTrie;
}
}
}
currentPosition++;
nodeId++;
}
// Set some values on the node that represents
// the entry's last character.
currentlyNodeFT.setEndEntrie(true);
currentlyNodeFT.addEntry(new Entry(s));
currentPosition++;
nodeId++;
}
// Set some values on the node that represents
// the entry's last character.
currentlyNodeFT.setEndEntrie(true);
currentlyNodeFT.addEntry(new Entry(sb.toString()));
lineMaxOffSet += 2; // Skip the line break before the next entry.
currentIndex += 2; // Skip the line-break token before the next entry.
}
root.setLevel(deepestLevel);
......
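The character-by-character loop in buildTrie is a standard trie insertion: walk from the root, reuse a child that already carries the current character, otherwise create one, and mark the node reached by the last character as the end of an entry. A minimal, hedged sketch follows; TrieNode is a stand-in for the plugin's NodeFT and the entries are illustrative.

    import java.util.ArrayList;
    import java.util.List;

    public class TrieSketch {
        static class TrieNode {
            Character symbol;
            boolean endOfEntry;
            List<TrieNode> children = new ArrayList<>();
            TrieNode(Character symbol) { this.symbol = symbol; }
        }

        static void insert(TrieNode root, String entry) {
            TrieNode current = root;
            for (int i = 0; i < entry.length(); i++) {
                char c = entry.charAt(i);
                TrieNode next = null;
                for (TrieNode child : current.children) {
                    if (child.symbol == c) { next = child; break; } // reuse the existing child
                }
                if (next == null) { // no child holds this character yet
                    next = new TrieNode(c);
                    current.children.add(next);
                }
                current = next;
            }
            current.endOfEntry = true; // node of the entry's last character
        }

        public static void main(String[] args) {
            TrieNode root = new TrieNode(null);
            insert(root, "new york");
            insert(root, "new jersey");
            System.out.println(root.children.size()); // prints 1: both entries share the prefix "new "
        }
    }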