Commit 6bb269ab authored by JR's avatar JR

Alterações na aplicação de transformação em tokens.

parent 8c35264c
......@@ -5,5 +5,6 @@
<classpathentry kind="con" path="org.eclipse.jdt.USER_LIBRARY/GATE"/>
<classpathentry kind="lib" path="E:/workspace/libraries/apache_log4j_1_2_17/log4j-1.2.17.jar"/>
<classpathentry kind="lib" path="E:/Programas/GATE/plugins/InexactGazetteer/lib/lucene-suggest-5.2.1.jar"/>
<classpathentry kind="lib" path="E:/Programas/GATE/plugins/InexactGazetteer/lib/StringSim.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>
......@@ -338,10 +338,11 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
@Override
public Resource init() throws ResourceInstantiationException {
    // Precompile, once, the pattern used to replace all unicode whitespace
    // in gazetteer entries with a single space.
    ws_pattern = Pattern.compile(ws_patternstring);
    // Build the trie before the resource is handed back to the caller.
    createTrie();
    return this;
}
......@@ -365,6 +366,7 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
}
/**
* Use dictionary file to configure PR and to build the trie.
*
* @throws Exception
*/
......@@ -450,13 +452,13 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
setGazetterDelimiter(listConfigFromDictionary.get("_entrieDelimiter"));
else
throw new ExecutionException("Your gazetteer miss a configuration: _entrieDelimiter");
if (listConfigFromDictionary.containsKey("_caseSensitive"))
setCaseSensitive(listConfigFromDictionary.get("_caseSensitive").equalsIgnoreCase("YES") ? true : false);
TrieUtil makeTrie = new TrieUtil();
// Carrega um map com as features de cada gazetteer.
this.gazetteerFeaturesMap = makeTrie.loadGazetteersAndFeatures(configFileURL, this.entrieDelimiter,
// Create a MAP with entries and their features.
this.gazetteerFeaturesMap = makeTrie.loadEntriesAndFeatures(configFileURL, this.entrieDelimiter,
this.featureNameValueSeparator, this.featuresSeparator, encoding);
for (Map.Entry<String, List<GazetteerFeature>> element : gazetteerFeaturesMap.entrySet()) {
......@@ -466,9 +468,9 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
gazetteerEntriesList.add(entrieNormalized);
}
// Create trie
// Build trie
this.trie = new NodeFT(null, Boolean.FALSE, new ArrayList<>(), 0);
this.trie = new TrieUtil().fillTrie(gazetteerEntriesList, trie, this.useTransformation, encoding,
this.trie = new TrieUtil().buildTrie(gazetteerEntriesList, trie, this.useTransformation, encoding,
this.transformationClass, this.transformationMethod);
this.deepestLevel = trie.getLevel();
this.trie.setLevel(0);
......@@ -546,30 +548,27 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
this.transformationClass, this.transformationMethod, this.caseSensitive);
if (editDistanceResultList != null && !editDistanceResultList.isEmpty()) {
String nomeClasseSimilaridade = getSimilarityClass();
String nomeMetodoSimilaridade = getSimilarityMethod();
// Create similarity class
Class c = Class.forName(nomeClasseSimilaridade);
Class c = Class.forName(getSimilarityClass());
// New Object.
Object obj = c.newInstance();
// Create similarity method.
Method metodoComparacao = c.getMethod(nomeMetodoSimilaridade, String.class, String.class);
Method m = c.getMethod(getSimilarityMethod(), String.class, String.class);
// Annotate the document.
ArrayList<SimilarityResult> similarityResultsReturn = new ArrayList<>();
ArrayList<SimilarityResult> similarityResults = new ArrayList<>();
for (EditDistanceResult edr : editDistanceResultList) {
similarityResultsReturn = auxiliarStringSimilarity.secondMatch(edr, metodoComparacao, obj,
similarityResults = auxiliarStringSimilarity.secondMatch(edr, m, obj,
this.minAcceptedSimilarity, this.numberBetterSimilarities, caseSensitive);
if (similarityResultsReturn != null && !similarityResultsReturn.isEmpty()) {
createLookups(edr, similarityResultsReturn);
similarityResultsReturn.clear();
if (similarityResults != null && !similarityResults.isEmpty()) {
createLookups(edr, similarityResults);
similarityResults.clear();
}
}
}
} catch (Exception e) {
logger.info(e.getMessage());
e.printStackTrace();
throw new ExecutionException(e.getMessage() + e.getCause().getMessage());
}
......@@ -580,11 +579,11 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
/**
*
* @param edr
* @param similarityResultsReturn
* @param similarityResults
*/
protected void createLookups(EditDistanceResult edr, ArrayList<SimilarityResult> similarityResultsReturn) {
protected void createLookups(EditDistanceResult edr, ArrayList<SimilarityResult> similarityResults) {
Integer lookupid;
for (SimilarityResult sr : similarityResultsReturn) {
for (SimilarityResult sr : similarityResults) {
FeatureMap fm = Factory.newFeatureMap();
fm.put(this.similarityFeatureName, sr.similarity);
fm.put("String", edr.getStringRead().getStringBuffer().toString());
......@@ -630,26 +629,20 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
}
private List<Action> actions;
@Override
public List<Action> getActions() {
if (actions == null) {
actions = new ArrayList<Action>();
// Action 1: remove the gazbin file and re-initialize the gazetteer
actions.add(new AbstractAction("Initialize") {
private static final long serialVersionUID = 1L;
@Override
public void actionPerformed(ActionEvent evt) {
File configFile = gate.util.Files.fileFromURL(getConfigFileURL());
String configFileName = configFile.getAbsolutePath();
try {
// reInit();
init();
} catch (ResourceInstantiationException ex) {
throw new GateRuntimeException("Re-initialization failed", ex);
} catch (ResourceInstantiationException e) {
throw new GateRuntimeException("Initialization failed", e);
}
}
});
......
......@@ -75,10 +75,11 @@ public class Search {
stringRead.setStringBuffer(stringRead.getStringBuffer().append(" "));
i++;
} else {
if (tokenAnnotation.getFeatures().get(ANNIEConstants.TOKEN_KIND_FEATURE_NAME).equals("word")) {
convertedToken += StringUtil.convert(
if (tokenAnnotation.getFeatures().get(ANNIEConstants.TOKEN_KIND_FEATURE_NAME).equals("word")
&& useTransformation) {
convertedToken += StringUtil.stringTransformation(
(String) tokenAnnotation.getFeatures().get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME),
useTransformation, convertionClass, convertionMethod);
convertionClass, convertionMethod);
} else {
convertedToken += (String) tokenAnnotation.getFeatures()
.get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME);
......@@ -180,8 +181,8 @@ public class Search {
* @return
* @throws Exception
*/
private List<ArrayList<NodeFT>> runSearchOnTrieCaseSensitive(Character currentChar, List<ArrayList<NodeFT>> activNodesArray,
int deepestLevel) throws Exception {
private List<ArrayList<NodeFT>> runSearchOnTrieCaseSensitive(Character currentChar,
List<ArrayList<NodeFT>> activNodesArray, int deepestLevel) throws Exception {
List<ArrayList<NodeFT>> activNodesArrayTemp = new ArrayList<ArrayList<NodeFT>>();
for (int i = 0; i <= deepestLevel; i++) {
......@@ -282,7 +283,7 @@ public class Search {
}
// O nó que já estava ativo antes de iniciar as
// associações para o caractere atual tem o
// associações para o caractere atual tem o
// EDValue incrementado.
if (rootNode.getEDValue() + 1 <= this.maxEd) {
rootNode.setEDValue(rootNode.getEDValue() + 1);
......@@ -405,7 +406,7 @@ public class Search {
}
// O nó que já estava ativo antes de iniciar as
// associações para o caractere atual tem o
// associações para o caractere atual tem o
// EDValue incrementado.
if (rootNode.getEDValue() + 1 <= this.maxEd) {
rootNode.setEDValue(rootNode.getEDValue() + 1);
......@@ -421,8 +422,7 @@ public class Search {
}
/**
* Return a list of activate nodes that indicate the end of an
* entry.
* Return a list of activate nodes that indicate the end of an entry.
*
* @param list
* @return List<NodeFT>
......
......@@ -20,7 +20,6 @@ import br.ufpr.inf.junior.imgazetteer.InputString;
*/
public class StringUtil {
/**
* Invoke transformation method.
*
......@@ -31,106 +30,97 @@ public class StringUtil {
* @throws IllegalArgumentException
* @return String
*/
public static String convert(String originalString, boolean useTransformation, String convertionClass,
String convertionMethod) throws IllegalArgumentException {
if (useTransformation) {
String convertedString = new String();
try {
// Create class
Class c = Class.forName(convertionClass);
// Instantiate class
Object obj = c.newInstance();
// Create method
Method metodoConversor = c.getMethod(convertionMethod, String.class);
// Invoke method
convertedString = (String) metodoConversor.invoke(obj, originalString);
} catch (NoSuchMethodException | SecurityException | InvocationTargetException | ClassNotFoundException
| IllegalAccessException | InstantiationException ex) {
Logger.getLogger(StringUtil.class.getName()).log(Level.SEVERE, ex.getMessage(), ex);
}
public static String stringTransformation(String originalString, String convertionClass, String convertionMethod)
throws IllegalArgumentException {
String convertedString = new String();
try {
// Create class
Class c = Class.forName(convertionClass);
// Instantiate class
Object obj = c.newInstance();
// Create method
Method m = c.getMethod(convertionMethod, String.class);
// Invoke method
convertedString = (String) m.invoke(obj, originalString);
return convertedString;
} else {
return originalString;
} catch (NoSuchMethodException | SecurityException | InvocationTargetException | ClassNotFoundException
| IllegalAccessException | InstantiationException ex) {
Logger.getLogger(StringUtil.class.getName()).log(Level.SEVERE, ex.getMessage(), ex);
}
return convertedString;
}
/**
*
* Return a list containing tokens from received string.
*
* @param stringIn
* @return List
*/
public static List<InputString> string2TokenList(String stringIn) {
InputString readString = new InputString();
boolean isNewToken = false;
List<InputString> tokenList = new ArrayList<>();
int index = 0;
StringBuffer stringBufferTemp = new StringBuffer();
try {
if (stringIn != null && stringIn.length() > 0) {
stringBufferTemp.append(stringIn.charAt(index));
index = 1;
while (index < stringIn.length()) {
isNewToken = isEndOfToken(stringIn.charAt(index - 1), stringIn.charAt(index));
if (isNewToken) {
readString.setStringBuffer(stringBufferTemp);
readString.setEndOffset(index - 1);
readString.setStartOffset((index) - (stringBufferTemp.length()));
tokenList.add(readString);
readString = new InputString();
stringBufferTemp = new StringBuffer();
}
stringBufferTemp.append(stringIn.charAt(index));
index++;
}
//Adiciona o ltimo token do texto na lista.
readString.setStringBuffer(stringBufferTemp);
readString.setStartOffset((index) - (stringBufferTemp.length()));
readString.setEndOffset(index - 1);
tokenList.add(readString);
}
} catch (Exception e) {
System.err.println("text2TokenList " + e.getMessage());
}
/**
*
* Return a list containing tokens from received string.
*
* @param stringIn
* @return List
*/
public static List<InputString> string2TokenList(String stringIn) {
InputString readString = new InputString();
boolean isNewToken = false;
List<InputString> tokenList = new ArrayList<>();
int index = 0;
StringBuffer stringBufferTemp = new StringBuffer();
return tokenList;
}
try {
if (stringIn != null && stringIn.length() > 0) {
stringBufferTemp.append(stringIn.charAt(index));
index = 1;
while (index < stringIn.length()) {
isNewToken = isEndOfToken(stringIn.charAt(index - 1), stringIn.charAt(index));
if (isNewToken) {
readString.setStringBuffer(stringBufferTemp);
readString.setEndOffset(index - 1);
readString.setStartOffset((index) - (stringBufferTemp.length()));
tokenList.add(readString);
readString = new InputString();
stringBufferTemp = new StringBuffer();
}
stringBufferTemp.append(stringIn.charAt(index));
index++;
}
// Add text's last token to the list.
readString.setStringBuffer(stringBufferTemp);
readString.setStartOffset((index) - (stringBufferTemp.length()));
readString.setEndOffset(index - 1);
tokenList.add(readString);
}
} catch (Exception e) {
System.err.println("text2TokenList " + e.getMessage());
}
return tokenList;
}
/**
 * Verify if the current character represents the end of a token.
 *
 * A token ends after {@code currentChar} when one of these holds:
 * <ul>
 * <li>exactly one of the two characters is a blank space (' ');</li>
 * <li>{@code currentChar} is a symbol: not a letter, not a digit, not an
 * underscore and not one of the whitespace characters space, tab, line
 * feed, vertical tab, form feed or carriage return;</li>
 * <li>{@code currentChar} is a letter and {@code nextChar} is not;</li>
 * <li>{@code currentChar} is a digit and {@code nextChar} is not.</li>
 * </ul>
 * Letters and digits are classified with {@link Character#isLetter(char)}
 * and {@link Character#isDigit(char)}, so accented letters such as 'é' or
 * 'ç' are treated as letters and do not split a word.
 *
 * @param currentChar
 *            character being examined; must not be {@code null}
 * @param nextChar
 *            character that follows it; must not be {@code null}
 * @return boolean {@code true} when a token ends at {@code currentChar}
 */
public static boolean isEndOfToken(Character currentChar, Character nextChar) {
    final char current = currentChar;
    final char next = nextChar;

    // Exactly one of the two characters is a blank space.
    if ((current == ' ') != (next == ' ')) {
        return true;
    }

    // Current char is NOT letter and NOT number and NOT blank space.
    // The whitespace set mirrors the regex class \s; the letter/digit test
    // is Unicode-aware (the former ASCII-only \W split accented words).
    final boolean isAsciiWhitespace = " \t\n\u000B\f\r".indexOf(current) >= 0;
    if (!Character.isLetterOrDigit(current) && current != '_' && !isAsciiWhitespace) {
        return true;
    }

    // Current char is letter and the next is NOT letter.
    if (Character.isLetter(current) && !Character.isLetter(next)) {
        return true;
    }

    // Current char is number and the next is NOT number.
    return Character.isDigit(current) && !Character.isDigit(next);
}
}
......@@ -45,8 +45,8 @@ public class TrieUtil {
* @return
* @throws Exception
*/
public NodeFT fillTrie(List<String> gazetteerList, NodeFT root, boolean useTransformation, String encoding,
String convertionClass, String convertionMethod) throws Exception {
public NodeFT buildTrie(List<String> gazetteerList, NodeFT root, boolean useTransformation, String encoding,
String transformationClass, String transformationMethod) throws Exception {
int deepestLevel = 0;
int numberChars = 0;
......@@ -56,55 +56,56 @@ public class TrieUtil {
int length = 0;
int levelInTrie = 0;
Integer nodeId = 0;
String convertedStr = new String();
List<InputString> tokenList = new ArrayList();
String convertedString = new String();
List<InputString> tokenList = new ArrayList();
List<String> convertedTokenList = new ArrayList();
String convertedToken = new String();
for (String s : gazetteerList) {
if (!useTransformation) {
convertedStr = s;
} else {// A transformation algorithm will be
convertedString = s;
} else {// Transformation algorithm will be
// used.
/*
* Utiliza um método próprio para dividir as entradas em
* tokens.
* Entries are divided in tokens using an algorithm
* developed in this project. Do not use annie gazetteer.
*/
convertedTokenList.clear();
tokenList.clear();
// Carrega todos os tokens de uma entrada.
// Get tokens from an entrie.
tokenList = StringUtil.string2TokenList(s);
// Converte os tokens.
// Tokens are transformed.
for (InputString tempToken : tokenList) {
numberChars += tempToken.getStringBuffer().length();
convertedToken = StringUtil.convert(tempToken.getStringBuffer().toString(),
useTransformation, convertionClass, convertionMethod);
// Se o resultado da função de conversão for NULL ou
// nada, mantém a string original.
if (convertedToken != null && !convertedToken.equals("")) {
convertedTokenList.add(convertedToken);
// ONLY apply transformation on tokens that start with a
// letter.
if (Character.isLetter(tempToken.getStringBuffer().charAt(0))) {
convertedToken = StringUtil.stringTransformation(tempToken.getStringBuffer().toString(),
transformationClass, transformationMethod);
} else {
convertedTokenList.add(tempToken.getStringBuffer().toString());
convertedToken = tempToken.getStringBuffer().toString();
}
convertedTokenList.add(convertedToken);
}
// Junta os tokens de uma entrada em uma string única.
// Create a single string using transformed tokens from an
// entrie.
StringBuffer stringBufferTemp = new StringBuffer();
for (String stringTemp : convertedTokenList) {
stringBufferTemp.append(stringTemp);
}
convertedStr = stringBufferTemp.toString();
convertedString = stringBufferTemp.toString();
}
currentlyNodeFT = root;
boolean foundChild = false;
currentPosition = 0;
length = convertedStr.length();
length = convertedString.length();
// Use each string character to build the trie.
// Use each entry character to build the trie.
while (currentPosition < length) {
Character c = convertedStr.charAt(currentPosition);
Character c = convertedString.charAt(currentPosition);
levelInTrie = currentlyNodeFT.getLevel() + 1;
foundChild = false;
// If currently node doesn't has any child.
......@@ -154,7 +155,7 @@ public class TrieUtil {
}
/**
* Create a node
* Create a node
*
* @param c
* @param levelInTrie
......@@ -168,48 +169,8 @@ public class TrieUtil {
}
/**
 * Identify and return entries from dictionary file.
 *
 * Lines are read until one whose trimmed content equals "|ENTRIES|" is
 * found; every line after that marker is returned, in file order. The
 * reader is consumed only through {@code readLine()}, so the first line is
 * still available when scanning starts.
 *
 * @param dictionaryBuffReader
 *            reader positioned at the beginning of the dictionary file
 * @return linesList the lines below the "|ENTRIES|" marker (empty when the
 *         marker is absent), or {@code null} when reading fails
 */
public List<String> extractGazettersFromList(BufferedReader dictionaryBuffReader) {
    try {
        String line = dictionaryBuffReader.readLine();
        logger.info("Linha 1: " + line);
        // An empty file yields a null first line; only log its length when
        // there is one.
        if (line != null) {
            logger.info("Linha 1 tamanho: " + line.length());
        }
        List<String> linesList = new ArrayList<String>();
        // Load into the list every line that holds a gazetteer entry
        // (the lines below the "|ENTRIES|" marker).
        while (null != line) {
            logger.info("Linha: " + line);
            logger.info("Linha lida: " + line);
            if (line.trim().compareTo("|ENTRIES|") == 0) {
                logger.info("Passou no IF do compareTo");
                // Everything after the marker is an entry.
                line = dictionaryBuffReader.readLine();
                while (line != null) {
                    linesList.add(line);
                    // Get next line.
                    line = dictionaryBuffReader.readLine();
                }
            } else {
                line = dictionaryBuffReader.readLine();
            }
        }
        return linesList;
    } catch (Exception e) {
        logger.error("extractGazettersFromList: " + e.toString());
        return null;
    }
}
/**
* Separate the entries from its features. Both are defined in the same line.
* Separate the entries from its features. Both are defined in the same
* line.
*
* @param configFileURL
* @param gazetterDelimiter
......@@ -217,7 +178,7 @@ public class TrieUtil {
* @param featuresSeparator
* @param encoding
*/
public HashMap<String, List<GazetteerFeature>> loadGazetteersAndFeatures(java.net.URL configFileURL,
public HashMap<String, List<GazetteerFeature>> loadEntriesAndFeatures(java.net.URL configFileURL,
String gazetterDelimiter, String featureNameValueSeparator, String featuresSeparator, String encoding)
throws Exception {
HashMap<String, List<GazetteerFeature>> gazetteerFeaturesMap = new HashMap<>();
......@@ -225,7 +186,8 @@ public class TrieUtil {
BufferedReader dictionaryBuffReader = new BomStrippingInputStreamReader((configFileURL).openStream(), encoding);
String line = dictionaryBuffReader.readLine();
// Put all entries in a list. Entries are every line below the tag "|ENTRIES|".
// Put all entries in a list. Entries are every line below the tag
// "|ENTRIES|".
while (null != line) {
if (line.equals("|ENTRIES|")) {
String linesArray[] = new String[2];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment