Added some corpus statistics predicates.
author YeGoblynQueenne@splinter <ep50@uni.brighton.ac.uk>
Thu, 18 Aug 2016 18:28:27 +0000 (20:28 +0200)
committer YeGoblynQueenne@splinter <ep50@uni.brighton.ac.uk>
Thu, 18 Aug 2016 18:28:27 +0000 (20:28 +0200)
* Some of the new predicates in corpus_utilities actually check the
  currently configured corpus instead of taking a corpus (ie, a set of
  examples) as an argument. The latter is probably a pattern I want to
  use. Probably. Passing the current examples corpus and then
  pretty-printing should be left to project_utilities predicates (these
  already handle the pretty-printing for the latest stats preds).

lib/corpus_utilities.pl
lib/project_utilities.pl

index 7314d3d..6b59708 100644 (file)
@@ -3,6 +3,12 @@
                           ,tokenise_corpus/0
                           ,tokenise_file/2
                           ,tokenise_line/7
+                          ,k_most_frequent_n_grams/4
+                          ,n_most_frequent_words/3
+                          ,n_least_frequent_sentences/3
+                          ,n_least_frequent_sentences/2
+                          ,n_most_frequent_sentences/2
+                          ,longest_sentence/2
                           ,average_sentence_length/2
                           ,max_sentence_length/2
                           ,min_sentence_length/2
@@ -404,6 +410,123 @@ write_terms(Stream, Terms):-
 
 
 
+%!     k_most_frequent_n_grams(+Corpus,+N,+K,-K_NGrams) is det.
+%
+%      Find the K most frequent N-grams in Corpus.
+%
+%      N-grams and their counts are obtained from corpus_n_grams/4,
+%      called with cleanup(of_course), and sorted in descending order
+%      of count. K_NGrams is bound to a list of at most K key-value
+%      pairs NGram-Count taken from the front of that ordering; if
+%      fewer than K pairs exist, all of them are returned.
+%
+%      Progress messages are printed with writeln/1 along the way.
+%
+k_most_frequent_n_grams(Cs, N, K, Ns):-
+       writeln('Getting k n-grams'),
+       corpus_n_grams(Cs,N, cleanup(of_course),Gs)
+       ,writeln('Sorting k n-grams')
+       % Descending on the second argument of each NGram-Count pair;
+       % @>= does not drop pairs with equal counts.
+       ,sort(2,@>=,Gs,Gs_)
+       ,writeln('Finding k most frequent n-grams')
+       % Collect the first K elements. nth1/3 fails past the end of
+       % Gs_, so a list shorter than K simply yields fewer pairs.
+       ,findall(Ng-C
+               ,(between(1,K,I)
+                ,nth1(I,Gs_,Ng-C)
+                )
+               ,Ns).
+
+
+
+%!     n_most_frequent_words(+Corpus,+N,-Words) is det.
+%
+%      Find the N most frequent Words in Corpus.
+%
+%      Word counts are obtained from corpus_token_types/3, called with
+%      cleanup(yeah), and sorted in descending order of count. Words
+%      is bound to a list of at most N key-value pairs Word-Count
+%      taken from the front of that ordering; if fewer than N pairs
+%      exist, all of them are returned.
+%
+n_most_frequent_words(Cs,N,Ws):-
+       corpus_token_types(Cs,cleanup(yeah),Us)
+       % Descending on the second argument of each Word-Count pair;
+       % @>= does not drop pairs with equal counts.
+       ,sort(2,@>=,Us,Us_)
+       % Collect the first N elements. nth1/3 fails past the end of
+       % Us_, so a list shorter than N simply yields fewer pairs.
+       ,findall(W-C
+               ,(between(1,N,I)
+                ,nth1(I,Us_,W-C)
+                )
+               ,Ws).
+
+
+
+%!     n_least_frequent_sentences(+N,+K,-Sentences) is det.
+%
+%      Find up to N sentences that appear at least K times in the
+%      current corpus.
+%
+%      Sentences is a list of pairs Tokens-Frequency. Entries are
+%      taken in the order returned by load_sentence_frequencies/1; no
+%      sorting is done here.
+%      NOTE(review): presumably that order is ascending by frequency,
+%      which is what would make these the _least_ frequent sentences -
+%      confirm against load_sentence_frequencies/1.
+%
+%      Sentences whose token list contains '(' are skipped and do not
+%      count towards N.
+%
+n_least_frequent_sentences(N, K, Fs):-
+       sentence_mle_term(T)
+       ,load_sentence_frequencies(Ss)
+       ,length(Ss, M)
+       % once/1 commits to the first batch of at most N solutions;
+       % findnsols/4 would otherwise offer further batches on
+       % backtracking.
+       ,once(findnsols(N
+                 ,Ts-F
+                 ,(member(S,Ss)
+                  ,S =.. [T,Ts,F]
+                  % Hack to avoid counting sentences with reminder text
+                  % Remove and fix tokenisation instead.
+                  ,\+ member('(',Ts)
+                  % P is meant to be the number of times Ts appears in
+                  % the corpus: the frequency F scaled by M, the number
+                  % of entries in Ss.
+                  % NOTE(review): this is only an exact count if F was
+                  % computed with M as its denominator - confirm.
+                  ,P is F * M
+                  ,P >= K
+                  )
+                 ,Fs)).
+
+
+
+%!     n_least_frequent_sentences(+N,-Sentences) is det.
+%
+%      Find the N least frequent sentences in the current corpus.
+%
+%      Sentences is a list of at most N pairs Tokens-Frequency, in
+%      ascending order of Frequency, built from the terms returned by
+%      load_sentence_frequencies/1.
+%
+%      Unlike n_least_frequent_sentences/3 and
+%      n_most_frequent_sentences/2, this predicate does not skip
+%      sentences whose token list contains '('.
+%
+n_least_frequent_sentences(N, Fs):-
+       sentence_mle_term(T)
+       ,load_sentence_frequencies(Ss)
+       % Ascending on the second argument (the frequency); @=< does
+       % not drop terms with equal frequencies.
+       ,sort(2,@=<,Ss,Ss_s)
+       % Take the first N terms and unpack each one into a
+       % Tokens-Frequency pair. nth1/3 fails past the end of Ss_s, so
+       % a short list simply yields fewer pairs.
+       ,findall(Ts-F
+               ,(between(1,N,I)
+                ,nth1(I,Ss_s,S)
+                ,S =.. [T,Ts,F]
+                )
+               ,Fs).
+
+
+
+%!     n_most_frequent_sentences(+N,-Sentences) is det.
+%
+%      Find the N most frequent sentences in the current corpus.
+%
+%      Sentences is a list of pairs Tokens-Frequency in descending
+%      order of Frequency, built from the terms returned by
+%      load_sentence_frequencies/1.
+%
+%      Sentences whose token list contains '(' are excluded _before_
+%      the first N are taken, so that they do not use up any of the N
+%      places. Fewer than N pairs are returned only when fewer than N
+%      eligible sentences exist.
+%
+n_most_frequent_sentences(N,Fs):-
+       sentence_mle_term(T)
+       ,load_sentence_frequencies(Ss)
+       % Descending on the second argument (the frequency); @>= does
+       % not drop terms with equal frequencies.
+       ,sort(2,@>=,Ss,Ss_s)
+       % Unpack every term into a Tokens-Frequency pair, dropping the
+       % sentences that contain '(' (reminder text).
+       ,findall(Ts-F
+               ,(member(S,Ss_s)
+                ,S =.. [T,Ts,F]
+                ,\+ member('(',Ts)
+                )
+               ,Ps)
+       % Only now truncate to the first N eligible pairs. nth1/3 fails
+       % past the end of Ps, so a short list simply yields fewer pairs.
+       ,findall(P
+               ,(between(1,N,I)
+                ,nth1(I,Ps,P)
+                )
+               ,Fs).
+
+
+
+%!     longest_sentence(+Corpus,-Sentence) is det.
+%
+%      Find the longest Sentence in Corpus.
+%
+%      Corpus is a list of sentences, each a list of tokens. Sentence
+%      is a term Length-Sentence, where Length is the number of tokens
+%      in Sentence.
+%
+%      Fails if Corpus is empty, since there is then no first element
+%      to return.
+%
+longest_sentence(Cs,LS):-
+       % Pair every sentence with its length.
+       findall(L-S
+              ,(member(S,Cs)
+               ,length(S,L)
+               )
+              ,Ls)
+       % Sort descending on the length key; the head of the sorted
+       % list is the longest sentence.
+       ,sort(1,@>,Ls,[LS|_]).
+
+
+
 %!     average_sentence_length(+Corpus,-Average_length) is det.
 %
 %      Calculate the mean length of sentences in Corpus.
@@ -727,6 +850,7 @@ write_fake_n_gram_counts:-
        ,assert(word_count(E, Xs_c)).
 
 
+
 %!     corpus_token_types(+Corpus,+Cleanup,-Token_types) is det.
 %
 %      Count uni-grams, a.k.a. token types, a.k.a. word types in
@@ -797,10 +921,12 @@ corpus_n_grams(Cs, N, cleanup(B), Gs):-
                 ,append(C,[E],C_)
                 )
                ,Cs_)
+       ,writeln('Counting ngrams...')
        % Count n-grams
         ,forall(member(Ss,Cs_)
               ,sentence_n_grams(Ss,N)
               )
+       ,writeln('Counted n-grams; cleaning up')
        % Report their counts.
        ,findall(Ws-C
                ,n_gram_count(Ws,C)
@@ -809,7 +935,9 @@ corpus_n_grams(Cs, N, cleanup(B), Gs):-
        ,(   B \= false
         ->  cleanup_n_gram_counts
         ;   true
-        ).
+        )
+       ,writeln('Cleaned up')
+       .
 
 
 %!     sentence_n_grams(+Sentence, +N) is det.
index da172dc..6879e71 100644 (file)
@@ -1,4 +1,9 @@
-\feff:-module(utilities, [let_configuration_option/2
+\feff:-module(utilities, [pretty_print_k_most_frequent_n_grams/2
+                   ,pretty_print_n_most_frequent_words/1
+                   ,pretty_print_longest_sentence/0
+                   ,pretty_print_n_least_frequent_sentences/2
+                   ,pretty_print_n_most_frequent_sentences/1
+                   ,let_configuration_option/2
                    ,k_splits/3
                    ,k_samples/3
                    ,k_samples/4
 */
 
 
+%!     pretty_print_k_most_frequent_n_grams(+N,+K) is det.
+%
+%      Compute and print out the K most frequent N-grams in the current
+%      corpus.
+%
+%      Note the argument order: N, the size of the n-grams, comes
+%      first and K, the number of n-grams to print, second. Both are
+%      passed on in that order to
+%      corpus_utilities:k_most_frequent_n_grams/4.
+%
+%      Each n-gram is printed on its own line as its count followed by
+%      its tokens joined with single spaces.
+%
+pretty_print_k_most_frequent_n_grams(N,K):-
+       examples_corpus(Cs)
+       ,corpus_utilities:k_most_frequent_n_grams(Cs,N,K,Ws)
+       ,forall(member(Ng-C, Ws)
+              ,(atomic_list_concat(Ng,' ',Ng_)
+               ,format('~w ~w~n',[C,Ng_])
+               )
+              ).
+
+
+
+%!     pretty_print_n_most_frequent_words(+N) is det.
+%
+%      Print a list of the N most frequent words in the current corpus.
+%
+%      Words and counts come from
+%      corpus_utilities:n_most_frequent_words/3. Each word is printed
+%      on its own line as its count followed by the word.
+%
+pretty_print_n_most_frequent_words(N):-
+       examples_corpus(Cs)
+       ,corpus_utilities:n_most_frequent_words(Cs,N,Ws)
+       ,forall(member(Ng-C, Ws)
+              % Ng is a single word here; it is wrapped in a list only
+              % so that atomic_list_concat/3 can turn it into an atom.
+              ,(atomic_list_concat([Ng],' ',Ng_)
+               ,format('~w ~w~n',[C,Ng_])
+               )
+              ).
+
+
+
+%!     pretty_print_longest_sentence is det.
+%
+%      Pretty-print the longest sentence in the current corpus.
+%
+%      Prints one line: the tokens of the sentence joined with single
+%      spaces, followed by the number of tokens.
+%
+pretty_print_longest_sentence:-
+       examples_corpus(Cs)
+       % NOTE(review): longest_sentence/2 is called unqualified here,
+       % while other predicates in this file are called as
+       % corpus_utilities:Goal - confirm that it is imported into this
+       % module.
+       ,longest_sentence(Cs,L-S)
+       ,atomic_list_concat(S,' ', S_)
+       ,format('~w ~w~n',[S_,L]).
+
+
+
+%!     pretty_print_n_least_frequent_sentences(+N,+K) is det.
+%
+%      Print the sentences found by n_least_frequent_sentences/3 for N
+%      and K: up to N sentences appearing in the current corpus at
+%      least K times.
+%
+%      Each sentence is printed on its own line as its frequency, with
+%      six decimal places, followed by its tokens joined with single
+%      spaces.
+%
+pretty_print_n_least_frequent_sentences(N,K):-
+       n_least_frequent_sentences(N,K,Fs)
+       % Careful with the names: in each pair F is the list of tokens
+       % and S is the frequency.
+       ,forall(member(F-S,Fs)
+              ,(atomic_list_concat(F, ' ', F_)
+              ,format('~6f ~w~n',[S,F_]))
+              )        .
+
+
+%!     pretty_print_n_most_frequent_sentences(+N) is det.
+%
+%      Print the sentences found by n_most_frequent_sentences/2 for N.
+%      N must be bound: it is used as the upper limit of between/3 in
+%      that predicate.
+%
+%      Sentences are taken from the currently configured examples
+%      corpus. Each one is printed on its own line as its frequency,
+%      with six decimal places, followed by its tokens joined with
+%      single spaces.
+%
+pretty_print_n_most_frequent_sentences(N):-
+       n_most_frequent_sentences(N,Fs)
+       % Careful with the names: in each pair F is the list of tokens
+       % and S is the frequency.
+       ,forall(member(F-S,Fs)
+              ,(atomic_list_concat(F, ' ', F_)
+              ,format('~6f ~w~n',[S,F_]))
+              ).
+
+
 %!     let_configuration_option(+Option,+Values) is det.
 %
 %      Set a configuration Option to a list of Values or query Option