|
|
|
@ -6,8 +6,7 @@ import io.vavr.Tuple2; |
|
|
|
|
import io.vavr.collection.*; |
|
|
|
|
import org.apache.commons.io.IOUtils; |
|
|
|
|
|
|
|
|
|
import java.io.IOException; |
|
|
|
|
import java.io.InputStream; |
|
|
|
|
import java.io.*; |
|
|
|
|
import java.nio.charset.Charset; |
|
|
|
|
import java.util.Comparator; |
|
|
|
|
import java.util.Locale; |
|
|
|
@ -25,14 +24,19 @@ public class Analysis { |
|
|
|
|
private static final String punctuation = "!,.;:"; |
|
|
|
|
|
|
|
|
|
public static void main(String... args) throws IOException { |
|
|
|
|
String in = IOUtils.resourceToString("/corpus.txt", Charset.defaultCharset()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try(InputStream is = Analysis.class.getResourceAsStream("/corpus.txt")) { |
|
|
|
|
|
|
|
|
|
HashSet<Character> keyChars = HashSet.ofAll(keys.toCharArray()); |
|
|
|
|
HashSet<Character> punctuationChars = HashSet.ofAll(punctuation.toCharArray()); |
|
|
|
|
|
|
|
|
|
Map<String, Integer> rawTokens = Stream.ofAll(in.chars().boxed()) |
|
|
|
|
.filter(c -> !punctuationChars.contains((char) c.intValue())) |
|
|
|
|
Map<String, Integer> rawTokens = Stream.ofAll(new BufferedReader(new InputStreamReader(is)) |
|
|
|
|
.lines() |
|
|
|
|
.parallel() |
|
|
|
|
.flatMap(s -> s.chars().boxed()) |
|
|
|
|
.filter(c -> !punctuationChars.contains((char) c.intValue())) |
|
|
|
|
) |
|
|
|
|
.foldLeft( |
|
|
|
|
Tuple("", HashMap.<String, Integer>empty()), |
|
|
|
|
(tup, integer) -> Character.isWhitespace(integer) ? tup.map(s -> "", map -> map.put(tup._1().toLowerCase(Locale.ROOT), 1, Integer::sum)) : tup.map1(s -> s + ((char) integer.intValue())) |
|
|
|
@ -44,16 +48,26 @@ public class Analysis { |
|
|
|
|
|
|
|
|
|
System.out.println("Cleaned tokens: " + cleaned.size()); |
|
|
|
|
|
|
|
|
|
System.out.println(cleaned); |
|
|
|
|
Seq<Integer> freq = cleaned.map(Tuple2::_2); |
|
|
|
|
int max = freq.max().getOrElse(0); |
|
|
|
|
|
|
|
|
|
System.out.println("Max frequency: " + max); |
|
|
|
|
int median = freq.get(freq.size() / 2); |
|
|
|
|
System.out.println("Median frequency: " + median); |
|
|
|
|
int cutoff = Math.max(median, 5); |
|
|
|
|
int maxLength = 15; |
|
|
|
|
|
|
|
|
|
Set<String> tokensWithPrefixes = cleaned.keySet().flatMap(Analysis::prefix); |
|
|
|
|
Map<String, Integer> cut = cleaned.filterValues(i -> i > cutoff).filterKeys(s -> s.length() <= maxLength); |
|
|
|
|
|
|
|
|
|
System.out.println(tokensWithPrefixes); |
|
|
|
|
System.out.println("Common tokens: " + cut.size()); |
|
|
|
|
|
|
|
|
|
Map<String, List<String>> probabilities = tokensWithPrefixes.toMap(Function.identity(), token -> cleaned |
|
|
|
|
Set<String> tokensWithPrefixes = cut.keySet().flatMap(Analysis::prefix); |
|
|
|
|
|
|
|
|
|
System.out.println("Tokens and prefixes: " + tokensWithPrefixes.size()); |
|
|
|
|
|
|
|
|
|
Map<String, List<String>> probabilities = tokensWithPrefixes.toMap(Function.identity(), token -> cut |
|
|
|
|
.filterKeys(key -> key.startsWith(token)) |
|
|
|
|
.map((key, value) -> Tuple(key.substring(token.length()), value)) |
|
|
|
|
.toList() |
|
|
|
|
.flatMap(t -> prefix(t._1()).map(it -> Tuple(t._2(), it))) |
|
|
|
|
.foldLeft(HashMap.<String, Integer>empty(), (run, k) -> run.put(k._2(), k._1(), Integer::sum)) |
|
|
|
|
.toList() |
|
|
|
@ -66,14 +80,23 @@ public class Analysis { |
|
|
|
|
._2() |
|
|
|
|
.map(Tuple2::_1)); |
|
|
|
|
|
|
|
|
|
System.out.println(probabilities); |
|
|
|
|
System.out.println("Generated probabilities. Saving file..."); |
|
|
|
|
|
|
|
|
|
StringBuilder data = new StringBuilder(); |
|
|
|
|
|
|
|
|
|
probabilities.forEach((key, values) -> { |
|
|
|
|
values.take(4).forEach(it -> data.append(key).append(":").append(it).append('\n')); |
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
IOUtils.write(data.toString(), new FileOutputStream("./corpus.8vim"), Charset.defaultCharset()); |
|
|
|
|
System.out.println("Saved."); |
|
|
|
|
|
|
|
|
|
Scanner s = new Scanner(System.in); |
|
|
|
|
|
|
|
|
|
while (true) { |
|
|
|
|
System.out.print("Start: "); |
|
|
|
|
String start = s.nextLine(); |
|
|
|
|
System.out.println(probabilities.get(start)); |
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|