# Source code for pyndl.count

"""
pyndl.count
-----------

*pyndl.count* provides functions in order to count

* words and symbols in a corpus file
* cues and outcomes in an event file

"""
# pylint: disable=redefined-outer-name, invalid-name

from collections import Counter, namedtuple
import gzip
import itertools
import multiprocessing
import sys
import warnings

from pyndl import io


# Record returned by ``cues_outcomes``: the number of events together with
# the cue and outcome frequency counters.
CuesOutcomes = namedtuple('CuesOutcomes', ['n_events', 'cues', 'outcomes'])
# Record returned by ``words_symbols``: the word and symbol frequency counters.
WordsSymbols = namedtuple('WordsSymbols', ['words', 'symbols'])


def _job_cues_outcomes(event_file_name, start, step, verbose=False):
    """
    Worker that tallies cues and outcomes for every ``step`` event starting
    from ``start`` event.

    Parameters
    ----------
    event_file_name
        forwarded unchanged to ``io.events_from_file``
    start, step
        forwarded to ``io.events_from_file`` as the ``start`` and ``step``
        keyword arguments
    verbose : bool
        if true, a ``.`` is printed and flushed to stdout for every
        100000th event handled by this worker (including the first one)

    Returns
    -------
    (n_events, cues, outcomes) : (int, collections.Counter, collections.Counter)
        ``n_events`` is the number of events this worker iterated over.

    """
    cue_counts = Counter()
    outcome_counts = Counter()
    n_events = 0  # remains 0 when the event iterator yields nothing
    event_stream = io.events_from_file(event_file_name, start=start, step=step)
    for index, (event_cues, event_outcomes) in enumerate(event_stream):
        n_events = index + 1
        for cue in event_cues:
            cue_counts[cue] += 1
        for outcome in event_outcomes:
            outcome_counts[outcome] += 1
        if verbose and index % 100000 == 0:
            print('.', end='')
            sys.stdout.flush()
    return (n_events, cue_counts, outcome_counts)


def cues_outcomes(event_file_name,
                  *, n_jobs=2, number_of_processes=None, verbose=False):
    """
    Counts cues and outcomes in event_file_name using n_jobs
    processes.

    Each worker process counts every ``n_jobs``-th event, with worker ``k``
    starting at event ``k``; the partial counts are summed afterwards.

    Parameters
    ----------
    event_file_name
        passed on to every worker (``_job_cues_outcomes``)
    n_jobs : int
        size of the process pool; also used as the step between the events
        handled by one worker
    number_of_processes : int, optional
        deprecated alias of ``n_jobs``; when given it overrides ``n_jobs``
        and a ``DeprecationWarning`` is issued
    verbose : bool
        if true, the workers print progress dots and a final message is
        printed once counting is done

    Returns
    -------
    (n_events, cues, outcomes) : (int, collections.Counter, collections.Counter)

    """
    if number_of_processes is not None:
        warnings.warn("Parameter `number_of_processes` is renamed to `n_jobs`. The old name "
                      "will stop working with v0.9.0.",
                      DeprecationWarning, stacklevel=2)
        n_jobs = number_of_processes
    with multiprocessing.Pool(n_jobs) as pool:
        # Worker k handles events k, k + n_jobs, k + 2 * n_jobs, ...
        step = n_jobs
        results = pool.starmap(_job_cues_outcomes,
                               ((event_file_name,
                                 start,
                                 step,
                                 verbose)
                                for start in range(n_jobs)))
        # Merge the per-worker partial results into the overall totals.
        n_events = 0
        cues = Counter()
        outcomes = Counter()
        for nn, cues_process, outcomes_process in results:
            n_events += nn
            cues += cues_process
            outcomes += outcomes_process

    if verbose:
        print('\n...counting done.')

    return CuesOutcomes(n_events, cues, outcomes)


def _job_words_symbols(corpus_file_name, start, step, lower_case=False,
                       verbose=False):
    """
    Counts the words and symbols for every ``step`` line starting from
    ``start`` line.

    It is assumed that words are separated by at least one space or by a new
    line character.

    .. note::

        Punctuation characters, brackets and some other characters are stripped
        from both ends of the word and are not counted.

    Parameters
    ----------
    corpus_file_name
        path of the text file to read
    start : int
        zero-based index of the first line to process
    step : int
        only every ``step``-th line after ``start`` is processed
    lower_case : bool
        if true, words are lower-cased before they are counted
    verbose : bool
        if true, a ``.`` is printed and flushed to stdout for every
        100000th processed line (including the first one)

    Returns
    -------
    (words, symbols) : (collections.Counter, collections.Counter)

    """
    # characters removed from the beginning and the end of every word
    strip_chars = '!?,.:;/"\'()^@*~'
    words = Counter()
    symbols = Counter()
    with open(corpus_file_name, 'r') as dfile:
        for nn, line in enumerate(itertools.islice(dfile, start, None, step)):
            # split() without arguments splits on any whitespace run, so the
            # tokens never carry surrounding whitespace themselves
            for word in line.split():
                word = word.strip(strip_chars)
                if lower_case:
                    word = word.lower()
                if not word:
                    continue
                words[word] += 1
                # update() counts the characters in place; ``+= Counter(word)``
                # would build a temporary Counter and rescan all of
                # ``symbols`` for every single word
                symbols.update(word)
            if verbose and nn % 100000 == 0:
                print('.', end='')
                sys.stdout.flush()
    return (words, symbols)


def words_symbols(corpus_file_name,
                  *, n_jobs=2, number_of_processes=None, lower_case=False, verbose=False):
    """
    Counts words and symbols in corpus_file_name using n_jobs
    processes.

    Each worker process counts every ``n_jobs``-th line, with worker ``k``
    starting at line ``k``; the partial counts are summed afterwards.

    Parameters
    ----------
    corpus_file_name
        passed on to every worker (``_job_words_symbols``)
    n_jobs : int
        size of the process pool; also used as the step between the lines
        handled by one worker
    number_of_processes : int, optional
        deprecated alias of ``n_jobs``; when given it overrides ``n_jobs``
        and a ``DeprecationWarning`` is issued
    lower_case : bool
        passed on to the workers, which lower-case words before counting
        when it is true
    verbose : bool
        if true, the workers print progress dots and a final message is
        printed once counting is done

    Returns
    -------
    (words, symbols) : (collections.Counter, collections.Counter)

    """
    if number_of_processes is not None:
        warnings.warn("Parameter `number_of_processes` is renamed to `n_jobs`. The old name "
                      "will stop working with v0.9.0.",
                      DeprecationWarning, stacklevel=2)
        n_jobs = number_of_processes
    with multiprocessing.Pool(n_jobs) as pool:
        # Worker k handles lines k, k + n_jobs, k + 2 * n_jobs, ...
        step = n_jobs
        results = pool.starmap(_job_words_symbols, ((corpus_file_name,
                                                     start,
                                                     step,
                                                     lower_case,
                                                     verbose)
                                                    for start in
                                                    range(n_jobs)))
        # Merge the per-worker partial results into the overall totals.
        words = Counter()
        symbols = Counter()
        for words_process, symbols_process in results:
            words += words_process
            symbols += symbols_process

    if verbose:
        print('\n...counting done.')

    return WordsSymbols(words, symbols)


def save_counter(counter, filename, *, header='key\tfreq\n'):
    """
    Saves a counter object into a tab delimited text file.

    The file starts with ``header`` followed by one ``key<TAB>count`` line
    per entry, ordered from the most common to the least common key.

    Parameters
    ----------
    counter : collections.Counter
        counter to write; only its ``most_common()`` method is used
    filename
        path of the file to write; an existing file is overwritten
    header : str
        text written verbatim before the first entry

    """
    with open(filename, 'wt') as dfile:
        dfile.write(header)
        dfile.writelines('{key}\t{count}\n'.format(key=key, count=count)
                         for key, count in counter.most_common())


def load_counter(filename):
    """
    Loads a counter out of a tab delimited text file.

    The first line is treated as a header and skipped. Every following
    non-empty line must have the form ``key<TAB>count``; the text after the
    last tab is the count, everything before it is the key. Keys are taken
    verbatim, so leading or trailing whitespace in a key is preserved.

    Parameters
    ----------
    filename
        path of the file to read

    Returns
    -------
    counter : collections.Counter

    Raises
    ------
    ValueError
        if a key occurs more than once, if a line has no tab, or if a count
        is not an integer

    """
    with open(filename, 'rt') as dfile:
        # skip header
        dfile.readline()
        counter = Counter()
        for line in dfile:
            # Remove only the line terminator: stripping all whitespace would
            # alter keys that start or end with whitespace.
            record = line.rstrip('\n')
            if not record:
                # empty line, e.g. a trailing newline at the end of the file
                continue
            key, count = record.rsplit('\t', 1)
            if key in counter:
                raise ValueError("%s contains two instances (words, symbols, ...) of the same spelling." % filename)
            counter[key] = int(count)
    return counter