Commit cf10b113 authored by JR

Changes for tokenizer in trie building.

parent 97e88225
......@@ -406,7 +406,7 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
protected void loadDataFromDictionary() throws Exception {
if (getConfigFileURL() != null) {
List<String> gazetteerEntriesList = new ArrayList<>();
// List<String> gazetteerEntriesList = new ArrayList<>();
logger.info("Creating trie from " + getConfigFileURL());
BufferedReader dictionaryBuffReader = new BomStrippingInputStreamReader((configFileURL).openStream(),
encoding);
......@@ -503,7 +503,7 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
this.gazetteerFeaturesMap = makeTrie.loadEntriesAndFeatures(configFileURL, this.entrieDelimiter,
this.featureNameValueSeparator, this.featuresSeparator, encoding);
HashMap<Integer, Integer> tamanhoLinhaMap = new HashMap<>();
HashMap<Integer, Integer> entrieLineLengthMap = new HashMap<>();
Integer linhaCount = 0;
File configFile = gate.util.Files.fileFromURL(configFileURL);
String entriesFileName = configFile.getAbsolutePath().replaceAll("\\.txt$", "\\_GAZETTEERS.txt");
......@@ -512,16 +512,16 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
linhaCount++;
String entrieNormalized = element.getKey().trim();
entrieNormalized = ws_pattern.matcher(entrieNormalized).replaceAll(" ");
gazetteerEntriesList.add(entrieNormalized);
tamanhoLinhaMap.put(linhaCount, entrieNormalized.length());
// gazetteerEntriesList.add(entrieNormalized);
entrieLineLengthMap.put(linhaCount, entrieNormalized.length());
writer.println(entrieNormalized);
}
writer.close();
// Build trie
this.trie = new NodeFT(null, Boolean.FALSE, new ArrayList<>(), 0);
this.trie = new TrieUtil().buildTrie(gazetteerEntriesList, trie, this.useTransformation, encoding,
this.transformationClass, this.transformationMethod, entriesFileName, tamanhoLinhaMap,
this.trie = new TrieUtil().buildTrie(trie, this.useTransformation, encoding,
this.transformationClass, this.transformationMethod, entriesFileName, entrieLineLengthMap,
this.tokeniserGatePR);
this.deepestLevel = trie.getLevel();
this.trie.setLevel(0);
......
......@@ -72,7 +72,7 @@ public class TrieUtil {
* @return NodeFT
* @throws Exception
*/
public NodeFT buildTrie(List<String> gazetteerList, NodeFT root, boolean useTransformation, String encoding,
public NodeFT buildTrie(NodeFT root, boolean useTransformation, String encoding,
String transformationClass, String transformationMethod, String entriesFilePath,
HashMap<Integer, Integer> tamanhoLinhaMap, String tokeniserPR) throws Exception {
int deepestLevel = 0;
......@@ -208,7 +208,7 @@ public class TrieUtil {
}
root.setLevel(deepestLevel);
logger.info("Number of entries: " + gazetteerList.size());
logger.info("Number of entries: " + lineCount);
logger.info("Number of characters: " + numberChars);
logger.info("Number of nodes: " + totalNos);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment