Commit cf10b113 authored by JR

Changes for tokenizer in trie building.

parent 97e88225
...@@ -406,7 +406,7 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action ...@@ -406,7 +406,7 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
protected void loadDataFromDictionary() throws Exception { protected void loadDataFromDictionary() throws Exception {
if (getConfigFileURL() != null) { if (getConfigFileURL() != null) {
List<String> gazetteerEntriesList = new ArrayList<>(); // List<String> gazetteerEntriesList = new ArrayList<>();
logger.info("Creating trie from " + getConfigFileURL()); logger.info("Creating trie from " + getConfigFileURL());
BufferedReader dictionaryBuffReader = new BomStrippingInputStreamReader((configFileURL).openStream(), BufferedReader dictionaryBuffReader = new BomStrippingInputStreamReader((configFileURL).openStream(),
encoding); encoding);
...@@ -503,7 +503,7 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action ...@@ -503,7 +503,7 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
this.gazetteerFeaturesMap = makeTrie.loadEntriesAndFeatures(configFileURL, this.entrieDelimiter, this.gazetteerFeaturesMap = makeTrie.loadEntriesAndFeatures(configFileURL, this.entrieDelimiter,
this.featureNameValueSeparator, this.featuresSeparator, encoding); this.featureNameValueSeparator, this.featuresSeparator, encoding);
HashMap<Integer, Integer> tamanhoLinhaMap = new HashMap<>(); HashMap<Integer, Integer> entrieLineLengthMap = new HashMap<>();
Integer linhaCount = 0; Integer linhaCount = 0;
File configFile = gate.util.Files.fileFromURL(configFileURL); File configFile = gate.util.Files.fileFromURL(configFileURL);
String entriesFileName = configFile.getAbsolutePath().replaceAll("\\.txt$", "\\_GAZETTEERS.txt"); String entriesFileName = configFile.getAbsolutePath().replaceAll("\\.txt$", "\\_GAZETTEERS.txt");
...@@ -512,16 +512,16 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action ...@@ -512,16 +512,16 @@ public class InexactGazetteer extends AbstractLanguageAnalyser implements Action
linhaCount++; linhaCount++;
String entrieNormalized = element.getKey().trim(); String entrieNormalized = element.getKey().trim();
entrieNormalized = ws_pattern.matcher(entrieNormalized).replaceAll(" "); entrieNormalized = ws_pattern.matcher(entrieNormalized).replaceAll(" ");
gazetteerEntriesList.add(entrieNormalized); // gazetteerEntriesList.add(entrieNormalized);
tamanhoLinhaMap.put(linhaCount, entrieNormalized.length()); entrieLineLengthMap.put(linhaCount, entrieNormalized.length());
writer.println(entrieNormalized); writer.println(entrieNormalized);
} }
writer.close(); writer.close();
// Build trie // Build trie
this.trie = new NodeFT(null, Boolean.FALSE, new ArrayList<>(), 0); this.trie = new NodeFT(null, Boolean.FALSE, new ArrayList<>(), 0);
this.trie = new TrieUtil().buildTrie(gazetteerEntriesList, trie, this.useTransformation, encoding, this.trie = new TrieUtil().buildTrie(trie, this.useTransformation, encoding,
this.transformationClass, this.transformationMethod, entriesFileName, tamanhoLinhaMap, this.transformationClass, this.transformationMethod, entriesFileName, entrieLineLengthMap,
this.tokeniserGatePR); this.tokeniserGatePR);
this.deepestLevel = trie.getLevel(); this.deepestLevel = trie.getLevel();
this.trie.setLevel(0); this.trie.setLevel(0);
......
...@@ -72,7 +72,7 @@ public class TrieUtil { ...@@ -72,7 +72,7 @@ public class TrieUtil {
* @return NodeFT * @return NodeFT
* @throws Exception * @throws Exception
*/ */
public NodeFT buildTrie(List<String> gazetteerList, NodeFT root, boolean useTransformation, String encoding, public NodeFT buildTrie(NodeFT root, boolean useTransformation, String encoding,
String transformationClass, String transformationMethod, String entriesFilePath, String transformationClass, String transformationMethod, String entriesFilePath,
HashMap<Integer, Integer> tamanhoLinhaMap, String tokeniserPR) throws Exception { HashMap<Integer, Integer> tamanhoLinhaMap, String tokeniserPR) throws Exception {
int deepestLevel = 0; int deepestLevel = 0;
...@@ -208,7 +208,7 @@ public class TrieUtil { ...@@ -208,7 +208,7 @@ public class TrieUtil {
} }
root.setLevel(deepestLevel); root.setLevel(deepestLevel);
logger.info("Number of entries: " + gazetteerList.size()); logger.info("Number of entries: " + lineCount);
logger.info("Number of characters: " + numberChars); logger.info("Number of characters: " + numberChars);
logger.info("Number of nodes: " + totalNos); logger.info("Number of nodes: " + totalNos);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment