begin work on corpus analysis

master
dfsek 1 year ago
parent 4ccddc22af
commit ce04181617
  1. 1
      build.gradle
  2. 11
      corpus/build.gradle
  3. 50
      corpus/src/main/java/com/dfsek/corpus/Analysis.java
  4. 6
      corpus/src/main/resources/corpus.txt
  5. 1
      settings.gradle

@ -19,6 +19,7 @@ allprojects {
mavenCentral()
google()
maven { url "https://jitpack.io" }
maven { url "https://oss.sonatype.org/content/repositories/snapshots" }
}
task checkstyle(type: Checkstyle) {
showViolations = true

@ -0,0 +1,11 @@
// Build script for the corpus module: a plain Java project.
apply(plugin: 'java')

// Compile the module's sources at the Java 11 language level.
java {
    sourceCompatibility = JavaVersion.VERSION_11
}

// Libraries used by the module's main source set.
dependencies {
    implementation('io.vavr:vavr:0.10.4')
    implementation('commons-io:commons-io:2.11.0')
}

@ -0,0 +1,50 @@
package com.dfsek.corpus;
import io.vavr.API;
import io.vavr.collection.*;
import org.apache.commons.io.IOUtils;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Locale;
import java.util.Scanner;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
import static io.vavr.API.*;
/**
 * Entry point for the corpus analysis tool.
 *
 * <p>Loads {@code /corpus.txt} from the classpath, counts how often each
 * whitespace-separated token occurs (case-insensitively, with punctuation
 * stripped), discards tokens containing characters outside {@code a-z}, prints
 * the result and then prompts on standard input until it is exhausted.
 */
public class Analysis {
    /** The only characters a token may contain to survive cleaning. */
    private static final String KEYS = "abcdefghijklmnopqrstuvwxyz";

    /** Characters removed from the corpus before tokens are formed. */
    private static final String PUNCTUATION = "!,.;:";

    /**
     * Runs the analysis and the interactive prompt.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the corpus resource cannot be read
     */
    public static void main(String... args) throws IOException {
        // The corpus ships with the application, so its encoding is fixed and must
        // not depend on the platform default charset of the machine running it.
        String in = IOUtils.resourceToString("/corpus.txt", java.nio.charset.StandardCharsets.UTF_8);

        HashSet<Character> keyChars = HashSet.ofAll(KEYS.toCharArray());
        HashSet<Character> punctuationChars = HashSet.ofAll(PUNCTUATION.toCharArray());

        Map<String, Integer> rawTokens = countTokens(in, punctuationChars);
        System.out.println("Imported " + rawTokens.size() + " raw tokens.");

        // Keep only tokens made up entirely of the supported key characters.
        Map<String, Integer> cleaned = rawTokens.filterKeys(
                token -> token.chars().allMatch(i -> keyChars.contains((char) i)));
        System.out.println("Cleaned tokens: " + cleaned.size());
        System.out.println(cleaned);

        Scanner scanner = new Scanner(System.in);
        while (true) {
            System.out.print("Start: ");
            // Stop cleanly at end of input instead of letting nextLine() throw
            // NoSuchElementException (e.g. when stdin is piped or closed).
            if (!scanner.hasNextLine()) {
                break;
            }
            // TODO: the entered line is consumed but not processed yet.
            scanner.nextLine();
            System.out.println();
        }
    }

    /**
     * Counts the occurrences of each whitespace-separated token in {@code text}.
     *
     * <p>Characters contained in {@code ignored} are dropped before tokens are
     * formed, and tokens are lower-cased with {@link Locale#ROOT}. Runs of
     * whitespace never yield an empty token, and a token that reaches the end of
     * the text without trailing whitespace is still counted.
     *
     * @param text    the text to tokenize
     * @param ignored characters to strip from the text
     * @return a map from lower-cased token to its number of occurrences
     */
    private static Map<String, Integer> countTokens(String text, HashSet<Character> ignored) {
        HashMap<String, Integer> counts = HashMap.empty();
        StringBuilder token = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (Character.isWhitespace(c)) {
                counts = flush(token, counts);
            } else if (!ignored.contains(c)) {
                token.append(c);
            }
        }
        // The last token has no whitespace after it when the text does not end
        // with a newline, so it has to be flushed explicitly.
        return flush(token, counts);
    }

    /**
     * Records the pending token in {@code counts} and clears the buffer.
     *
     * @param token  buffer holding the characters of the token read so far
     * @param counts the counts accumulated so far
     * @return {@code counts} unchanged if the buffer is empty, otherwise a map
     *         with the token's count incremented by one
     */
    private static HashMap<String, Integer> flush(StringBuilder token, HashMap<String, Integer> counts) {
        if (token.length() == 0) {
            return counts;
        }
        String word = token.toString().toLowerCase(Locale.ROOT);
        token.setLength(0);
        return counts.put(word, 1, Integer::sum);
    }
}

@ -0,0 +1,6 @@
aaaa
abaa
abba...
abbb4
abbb
abab

@ -1 +1,2 @@
// Modules that make up this multi-project build.
include ':8vim', ':corpus'
Loading…
Cancel
Save