parent
4ccddc22af
commit
ce04181617
@ -0,0 +1,11 @@ |
||||
// Build script for a plain Java module.
plugins {
    id 'java'
}

// Compile the sources at the Java 11 language level.
java {
    sourceCompatibility = JavaVersion.VERSION_11
}

// Third-party libraries the module's sources compile against.
dependencies {
    implementation 'commons-io:commons-io:2.11.0'
    implementation 'io.vavr:vavr:0.10.4'
}
@ -0,0 +1,50 @@ |
||||
package com.dfsek.corpus; |
||||
|
||||
import io.vavr.API; |
||||
import io.vavr.collection.*; |
||||
import org.apache.commons.io.IOUtils; |
||||
|
||||
import java.io.IOException; |
||||
import java.nio.charset.Charset; |
||||
import java.util.Locale; |
||||
import java.util.Scanner; |
||||
import java.util.function.BiFunction; |
||||
import java.util.stream.Collectors; |
||||
|
||||
import static io.vavr.API.*; |
||||
|
||||
public class Analysis { |
||||
private static final String keys = "abcdefghijklmnopqrstuvwxyz"; |
||||
private static final String punctuation = "!,.;:"; |
||||
public static void main(String... args) throws IOException { |
||||
String in = IOUtils.resourceToString("/corpus.txt", Charset.defaultCharset()); |
||||
|
||||
HashSet<Character> keyChars = HashSet.ofAll(keys.toCharArray()); |
||||
HashSet<Character> punctuationChars = HashSet.ofAll(punctuation.toCharArray()); |
||||
|
||||
Map<String, Integer> rawTokens = Stream.ofAll(in.chars().boxed()) |
||||
.filter(c -> !punctuationChars.contains((char) c.intValue())) |
||||
.foldLeft( |
||||
Tuple("", HashMap.<String, Integer>empty()), |
||||
(tup, integer) -> Character.isWhitespace(integer) ? tup.map(s -> "", map -> map.put(tup._1().toLowerCase(Locale.ROOT), 1, Integer::sum)) : tup.map1(s -> s + ((char) integer.intValue())) |
||||
)._2(); |
||||
|
||||
System.out.println("Imported " + rawTokens.size() + " raw tokens."); |
||||
|
||||
Map<String, Integer> cleaned = rawTokens.filterKeys(token -> token.chars().allMatch(i -> keyChars.contains((char) i))); |
||||
|
||||
System.out.println("Cleaned tokens: " + cleaned.size()); |
||||
|
||||
|
||||
|
||||
System.out.println(cleaned); |
||||
|
||||
Scanner s = new Scanner(System.in); |
||||
while (true) { |
||||
System.out.print("Start: "); |
||||
String start = s.nextLine(); |
||||
System.out.println(); |
||||
|
||||
} |
||||
} |
||||
} |
@ -0,0 +1,6 @@ |
||||
aaaa |
||||
abaa |
||||
abba... |
||||
abbb4 |
||||
abbb |
||||
abab |
@ -1 +1,2 @@ |
||||
// Subprojects that make up this build.
include ':8vim', ':corpus'
Loading…
Reference in new issue