import utils

beta = 1


# https://aclanthology.org/W04-1013/
def rouge_n(reference_summary, created_summary, n, pp_options=None, extended_results=False):
    """
    Calculates the ROUGE-N score.
    :param reference_summary: gold standard summary
    :param created_summary: summary to evaluate
    :param n: size of the n-grams
    :param pp_options: list of options for preprocessing; if None, no preprocessing is done
    :param extended_results: indicates whether precision, recall and f-measure should be returned
    :return: the recall score, or (precision, recall, f-measure) if extended results are requested
    """
    # preprocess
    if pp_options is not None:
        reference_summary = utils.preprocess_text(reference_summary, pp_options)
        created_summary = utils.preprocess_text(created_summary, pp_options)
    else:
        # text is already preprocessed; only separate sentence marks from the tokens
        for sentence_mark in utils.sentence_marks:
            reference_summary = reference_summary.replace(sentence_mark, ' ' + sentence_mark)
            created_summary = created_summary.replace(sentence_mark, ' ' + sentence_mark)

    # split into n-grams of size n and count the occurrences of each n-gram
    reference_ngrams, ref_complete_count = count_n_grams(reference_summary, n)
    created_ngrams, created_complete_count = count_n_grams(created_summary, n)

    # count the n-grams appearing in both summaries, clipped to the reference count
    overlapping_count = 0
    for ref_key, ref_count in reference_ngrams.items():
        created_count = created_ngrams.get(ref_key)
        if created_count is not None:  # n-gram occurs in both summaries
            overlapping_count += min(ref_count, created_count)

    # calculate score
    if ref_complete_count == 0:
        return (0, 0, 0) if extended_results else 0
    recall = overlapping_count / ref_complete_count
    if extended_results:
        precision = overlapping_count / created_complete_count if created_complete_count > 0 else 0
        if precision + recall == 0:
            return precision, recall, 0
        return precision, recall, (2 * precision * recall) / (precision + recall)
    return recall


def count_n_grams(pp_summary, n):
    """
    Counts the n-grams of the given size in a summary.
    :param pp_summary: pre-processed summary
    :param n: size of the n-grams
    :return: ({ngram: count}, total n-gram count) for all n-grams in the summary
    """
    words = pp_summary.split(' ')
    complete_count = 0
    n_grams = {}
    for i in range(len(words) - (n - 1)):
        n_gram = ' '.join(words[i:i + n])
        if n_gram != '':
            complete_count += 1
            n_grams[n_gram] = n_grams.get(n_gram, 0) + 1
    return n_grams, complete_count


def rouge_l(reference_summary, created_summary, pp_options=None, extended_results=False):
    """
    Calculates the ROUGE-L value of a summary against its gold standard summary.
    :param reference_summary: gold standard summary
    :param created_summary: created summary to compare
    :param pp_options: list of options for preprocessing; if None, no preprocessing is done
    :param extended_results: if True, precision, recall and f-measure will be returned
    :return: the f-measure, or (precision, recall, f-measure) if extended results are requested
    """
    # preprocess
    if pp_options is not None:
        # otherwise don't preprocess; text is already preprocessed
        reference_summary = utils.preprocess_text(reference_summary, pp_options)
        created_summary = utils.preprocess_text(created_summary, pp_options)

    # count the words, then split into sentences (sentence marks are separated from the words)
    m_reference_word_number = len(reference_summary.split(' '))
    reference_summary = utils.split_into_sentences(reference_summary)
    n_created_word_number = len(created_summary.split(' '))
    created_summary = utils.split_into_sentences(created_summary)

    # one separate set per sentence, to make sure every word is counted only once
    used_created_indices = [set() for _ in created_summary]
    used_gold_indices = [set() for _ in reference_summary]
    for j in range(len(reference_summary)):
        ref_sentence = reference_summary[j]
        # calculate the union of the longest common subsequences with every created sentence
        for i in range(len(created_summary)):
            created_sentence = created_summary[i]
            indices_a, indices_b = get_subsequence(ref_sentence, created_sentence)
            used_gold_indices[j] = used_gold_indices[j].union(indices_a)
            # use the indices of b here to ensure words aren't used twice
            used_created_indices[i] = used_created_indices[i].union(indices_b)

    used_created_indices = [len(sent_set) for sent_set in used_created_indices]
    used_gold_indices = [len(sent_set) for sent_set in used_gold_indices]
    total_sum_subsequences = min(sum(used_gold_indices), sum(used_created_indices))

    if total_sum_subsequences == 0:
        return (0, 0, 0) if extended_results else 0
    p_lcs = total_sum_subsequences / n_created_word_number
    r_lcs = total_sum_subsequences / m_reference_word_number
    f_lcs = ((1 + beta * beta) * r_lcs * p_lcs) / (r_lcs + beta * beta * p_lcs)
    if extended_results:
        return p_lcs, r_lcs, f_lcs
    return f_lcs


def get_subsequence(sent_a, sent_b):
    """
    Greedily finds a long (not necessarily consecutive) common subsequence of sent_a in sent_b.
    :param sent_a: sentence to take the subsequence from
    :param sent_b: sentence to find the subsequence in
    :return: (ind_a, ind_b) two sets with the word indices of the matched subsequence in sent_a and sent_b
    """
    result_a = set()
    result_b = set()
    words_a = sent_a.split(' ')
    words_b = sent_b.split(' ')
    # try every starting position in sent_a and keep the longest greedy match
    for start_index_a in range(len(words_a)):
        word_result_a = set()
        word_result_b = set()
        search_start_b = 0
        word_index_a = start_index_a
        while word_index_a < len(words_a):
            try:
                # find the current word of a in b, starting after the previous match
                found_index = words_b.index(words_a[word_index_a], search_start_b)
                word_result_a.add(word_index_a)
                word_result_b.add(found_index)
                search_start_b = found_index + 1
            except ValueError:
                # word not contained in b, skip it
                pass
            word_index_a += 1
        if len(word_result_a) > len(result_a):
            result_a = word_result_a
            result_b = word_result_b
    return result_a, result_b


if __name__ == "__main__":
    print('Done')
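
# Minimal usage sketch (illustrative only): the texts and n values below are made-up
# examples and assume the accompanying `utils` module is importable; with
# pp_options=None the summaries are expected to be tokenized/preprocessed already.
#
#     reference = "the cat sat on the mat ."
#     candidate = "the cat lay on the mat ."
#     rouge_n(reference, candidate, n=2)                         # ROUGE-2 recall
#     rouge_n(reference, candidate, n=1, extended_results=True)  # (precision, recall, f-measure)
#     rouge_l(reference, candidate, extended_results=True)       # (p_lcs, r_lcs, f_lcs)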