Skip to content

Commit 92eef39

Browse files
committed
added tokenization
1 parent 1b94779 commit 92eef39

File tree

7 files changed

+154
-75
lines changed

7 files changed

+154
-75
lines changed

Digital-Assignment/.idea/workspace.xml

Lines changed: 89 additions & 69 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Digital-Assignment/Readme.md

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,8 @@ Develop Java Programs implementing 10 new concepts/features/Topics (not in our s
1212
3. **Resolving Authentication failure**: Open [this link](https://www.google.com/settings/security/lesssecureapps) and select the "Turn on" radio button to allow mail to be sent from unknown locations.
1313

1414
## Language Detection using OpenNLP
15-
* Code - [LanguageDetectorMain.java](https://github.com/jacobjohn2016/Java-Programming/blob/master/Digital-Assignment/src/languagedetector/LanguageDetectorMain.java), [LanguageMapper.java](https://github.com/jacobjohn2016/Java-Programming/blob/master/Digital-Assignment/src/languagedetector/LanguageMapper.java)
15+
* Code - [LanguageDetectorMain.java](https://github.com/jacobjohn2016/Java-Programming/blob/master/Digital-Assignment/src/OpenNLP/languagedetector/LanguageDetectorMain.java), [LanguageMapper.java](https://github.com/jacobjohn2016/Java-Programming/blob/master/Digital-Assignment/src/OpenNLP/languagedetector/LanguageMapper.java)
1616
* [Source](https://github.com/Ruthwik/Language-Detection)
1717
* [`.jar` files](https://github.com/jacobjohn2016/Java-Programming/tree/master/Digital-Assignment/apache-opennlp-1.9.1/lib)
1818
* The Apache OpenNLP library is a machine learning based toolkit for the processing of natural language text. It supports the most common NLP tasks, such as language detection, tokenization, sentence segmentation, part-of-speech tagging, named entity extraction, chunking, parsing and coreference resolution.
1919
* This model is trained for, and works well with, longer texts that contain at least two sentences in the same language.
20-
21-
## Face Detection using OpenCV
22-
* [Code](https://github.com/jacobjohn2016/Java-Programming/blob/master/Digital-Assignment/src/FaceDetector.java)
23-
* [Source](https://www.geeksforgeeks.org/image-processing-java-set-9-face-detection/)
24-
* [`.jar` files](https://github.com/jacobjohn2016/Java-Programming/blob/master/Digital-Assignment/opencv-3.2.0-1.jar)
Binary file not shown.
430 KB
Binary file not shown.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package OpenNLP.Tokenization;
2+
3+
import opennlp.tools.tokenize.SimpleTokenizer;
4+
import opennlp.tools.tokenize.WhitespaceTokenizer;
5+
6+
public class SentenceTokenization {
7+
public static void main(String[] args) {
8+
String sentence = "Hi. How are you? Welcome to Tutorialspoint. "
9+
+ "We provide free tutorials on various technologies";
10+
11+
//Instantiating SimpleTokenizer class
12+
SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
13+
14+
//Tokenizing the given sentence
15+
String tokens[] = simpleTokenizer.tokenize(sentence);
16+
17+
//Printing the tokens
18+
for(String token :tokens){
19+
System.out.println(token);
20+
}
21+
22+
//Instantiating whitespaceTokenizer class
23+
WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE;
24+
25+
//Tokenizing the given paragraph
26+
String whitetokens[] = whitespaceTokenizer.tokenize(sentence);
27+
28+
//Printing the tokens
29+
for(String token : whitetokens)
30+
System.out.println(token);
31+
}
32+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import java.io.FileInputStream;
2+
import java.io.InputStream;
3+
import opennlp.tools.tokenize.TokenizerME;
4+
import opennlp.tools.tokenize.TokenizerModel;
5+
import opennlp.tools.util.Span;
6+
7+
public class TokenizerMEProbs {
8+
9+
public static void main(String args[]) throws Exception{
10+
String sent = "Hello John how are you welcome to Tutorialspoint";
11+
12+
//Loading the Tokenizer model
13+
InputStream inputStream = new FileInputStream("resources/en-token.bin");
14+
TokenizerModel tokenModel = new TokenizerModel(inputStream);
15+
16+
//Instantiating the TokenizerME class
17+
TokenizerME tokenizer = new TokenizerME(tokenModel);
18+
19+
//Retrieving the positions of the tokens
20+
Span tokens[] = tokenizer.tokenizePos(sent);
21+
22+
//Getting the probabilities of the recent calls to tokenizePos() method
23+
double[] probs = tokenizer.getTokenProbabilities();
24+
25+
//Printing the spans of tokens
26+
for(Span token : tokens)
27+
System.out.println(token +" "+sent.substring(token.getStart(), token.getEnd()));
28+
System.out.println(" ");
29+
for(int i = 0; i<probs.length; i++)
30+
System.out.println(probs[i]);
31+
}
32+
}

0 commit comments

Comments
 (0)