# -*- coding: utf-8 -*-

# Natural Language Toolkit: Gale-Church Aligner
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Torsten Marek
# Contributor: Cassidy Laidlaw, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
http://aclweb.org/anthology/J93-1004.pdf

"""

from __future__ import division
import math

try:
    from scipy.stats import norm

    # scipy's log survival function log(1 - CDF(x)); the pure-Python
    # fallback below defines the same name.
    norm_logsf = norm.logsf

except ImportError:

    def erfcc(x):
        """Complementary error function."""
        z = abs(x)
        t = 1 / (1 + 0.5 * z)
        r = t * math.exp(
            -z * z - 1.26551223 +
            t * (1.00002368 +
                 t * (0.37409196 +
                      t * (0.09678418 +
                           t * (-0.18628806 +
                                t * (0.27886807 +
                                     t * (-1.13520398 +
                                          t * (1.48851587 +
                                               t * (-0.82215223 +
                                                    t * 0.17087277))))))))
        )
        if x >= 0.:
            return r
        else:
            return 2. - r

    def norm_cdf(x):
        """Return the area under the normal distribution from M{-∞..x}."""
        return 1 - 0.5 * erfcc(x / math.sqrt(2))

    def norm_logsf(x):
        """Return the log of the survival function 1 - CDF(x)."""
        try:
            return math.log(1 - norm_cdf(x))
        except ValueError:
            return float('-inf')


LOG2 = math.log(2)


class LanguageIndependent(object):
    # These are the language-independent probabilities and parameters
    # given in Gale & Church

    # for the computation, l_1 is always the language with less characters
    PRIORS = {
        (1, 0): 0.0099,
        (0, 1): 0.0099,
        (1, 1): 0.89,
        (2, 1): 0.089,
        (1, 2): 0.089,
        (2, 2): 0.011,
    }

    AVERAGE_CHARACTERS = 1
    VARIANCE_CHARACTERS = 6.8


def trace(backlinks, source_sents_lens, target_sents_lens):
    """
    Traverse the alignment cost from the tracebacks and retrieve the
    appropriate sentence pairs.

    :param backlinks: A dictionary where the keys are alignment points and
        the values are the best alignment types ending at those points
        (keys of LanguageIndependent.PRIORS)
    :type backlinks: dict
    :param source_sents_lens: A list of source sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    """
    links = []
    position = (len(source_sents_lens), len(target_sents_lens))

    while position != (0, 0) and all(p >= 0 for p in position):
        try:
            s, t = backlinks[position]
        except TypeError:
            position = (position[0] - 1, position[1] - 1)
            continue
        for i in range(s):
            for j in range(t):
                links.append((position[0] - i - 1, position[1] - j - 1))
        position = (position[0] - s, position[1] - t)

    return links[::-1]


def align_log_prob(i, j, source_sents, target_sents, alignment, params):
    """Returns the log probability of the two sentences C{source_sents[i]},
    C{target_sents[j]} being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The log probability of a specific alignment between the two
        sentences, given the parameters, negated so that C{align_blocks}
        can minimise it as a cost.
    """
    l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0]))
    l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1]))
    try:
        # actually, the paper says l_s * params.VARIANCE_CHARACTERS; this is
        # based on the C reference implementation. With l_s in the
        # denominator, insertions are impossible.
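        #
        # A sketch of the underlying model (my reading of Gale & Church 1993):
        # l_t is treated as approximately normal with mean
        # l_s * AVERAGE_CHARACTERS and variance m * VARIANCE_CHARACTERS, so
        # delta below is a z-score. Only abs(delta) is passed to norm_logsf,
        # so the sign convention does not matter.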
        m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
        delta = ((l_s * params.AVERAGE_CHARACTERS - l_t)
                 / math.sqrt(m * params.VARIANCE_CHARACTERS))
    except ZeroDivisionError:
        return float('-inf')

    return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))


def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent):
    """Return the sentence alignment of two text blocks (usually paragraphs).

    >>> align_blocks([5,5,5], [7,7,7])
    [(0, 0), (1, 1), (2, 2)]
    >>> align_blocks([10,5,5], [12,20])
    [(0, 0), (1, 1), (2, 1)]
    >>> align_blocks([12,20], [10,5,5])
    [(0, 0), (1, 1), (1, 2)]
    >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
    [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    """
    alignment_types = list(params.PRIORS.keys())

    # there are always three rows in the history (with the last of them
    # being filled)
    D = [[]]

    backlinks = {}

    for i in range(len(source_sents_lens) + 1):
        for j in range(len(target_sents_lens) + 1):
            min_dist = float('inf')
            min_align = None
            for a in alignment_types:
                prev_i = -1 - a[0]
                prev_j = j - a[1]
                if prev_i < -len(D) or prev_j < 0:
                    continue
                p = D[prev_i][prev_j] + align_log_prob(i, j, source_sents_lens,
                                                       target_sents_lens, a,
                                                       params)
                if p < min_dist:
                    min_dist = p
                    min_align = a

            if min_dist == float('inf'):
                min_dist = 0

            backlinks[(i, j)] = min_align
            D[-1].append(min_dist)

        if len(D) > 2:
            D.pop(0)
        D.append([])

    return trace(backlinks, source_sents_lens, target_sents_lens)


def align_texts(source_blocks, target_blocks, params=LanguageIndependent):
    """Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed
    by sentence alignment links.

    Each block consists of a list that contains the lengths (in characters)
    of the sentences in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists.
    """
    if len(source_blocks) != len(target_blocks):
        raise ValueError("Source and target texts do not have the same "
                         "number of blocks.")

    return [align_blocks(source_block, target_block, params)
            for source_block, target_block in zip(source_blocks, target_blocks)]


# File I/O functions; may belong in a corpus reader


def split_at(it, split_value):
    """Splits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    """
    def _chunk_iterator(first):
        v = first
        while v != split_value:
            yield v
            try:
                v = next(it)
            except StopIteration:
                # The stream ended mid-chunk; end the chunk instead of
                # letting StopIteration escape the generator (PEP 479).
                return

    while True:
        try:
            yield _chunk_iterator(next(it))
        except StopIteration:
            # The underlying iterator is exhausted; return instead of
            # letting StopIteration escape the generator (PEP 479).
            return


def parse_token_stream(stream, soft_delimiter, hard_delimiter):
    """Parses a stream of tokens and splits it into sentences (using
    C{soft_delimiter} tokens) and blocks (using C{hard_delimiter} tokens)
    for use with the L{align_texts} function.
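
    A small worked example (the tokens and delimiter strings below are made
    up for illustration; each sentence contributes the summed length of its
    tokens, grouped per block):

    >>> tokens = "Hello world .EOS Bye . .EOS .EOP".split()
    >>> parse_token_stream(iter(tokens), ".EOS", ".EOP")
    [[10, 4]]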
""" return [ [sum(len(token) for token in sentence_it) for sentence_it in split_at(block_it, soft_delimiter)] for block_it in split_at(stream, hard_delimiter)] # Code for test files in nltk_contrib/align/data/*.tok # import sys # from contextlib import nested # with nested(open(sys.argv[1], "r"), open(sys.argv[2], "r")) as (s, t): # source = parse_token_stream((l.strip() for l in s), ".EOS", ".EOP") # target = parse_token_stream((l.strip() for l in t), ".EOS", ".EOP") # print align_texts(source, target)