
Python editdistance module: eval() examples

The following 48 code examples, collected from open-source Python projects, illustrate how editdistance.eval() is used.
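Before the project examples, a minimal sketch of the API itself (the strings and token lists below are illustrative, not taken from any of the projects): editdistance.eval takes two sequences and returns their Levenshtein distance as an int.

import editdistance

# Levenshtein distance between two strings:
assert editdistance.eval("kitten", "sitting") == 3

# eval also accepts any sequences of hashable items, e.g. token lists:
assert editdistance.eval(["the", "cat"], ["the", "hat"]) == 1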

Project: keras | Author: GeekLiB
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
Project: pCVR | Author: xjtushilei
Same show_edit_distance implementation as the keras example above.
Project: markov-sentence-correction | Author: anassinator
def total_distance(observed_sentence, corrected_sentence):
    """Calculates the total distance between the two given sentences.

    Args:
        observed_sentence: Observed sentence.
        corrected_sentence: Corrected sentence.

    Returns:
        Total Levenshtein distance between the two sentences.
    """
    total_distance = 0

    observed_words = list(observed_sentence)
    corrected_words = list(corrected_sentence)

    for i in range(len(observed_words)):
        comparable_words = observed_words[i], corrected_words[i]
        total_distance += editdistance.eval(*comparable_words)

    return total_distance
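Note that total_distance indexes both word lists in lockstep, so it assumes the two sentences tokenize to the same number of words. A hypothetical usage sketch:

observed = ["helo", "world"]
corrected = ["hello", "world"]
assert total_distance(observed, corrected) == 1  # 1 edit for "helo", 0 for "world"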
Project: thaanaOCR | Author: Sofwath
Same show_edit_distance implementation as the keras example above.
Project: keras-customized | Author: ambrite
Same show_edit_distance implementation as the keras example above.
Project: keras-mxnet-benchmarks | Author: sandeep-krishnamurthy
Same show_edit_distance implementation as the keras example above.
Project: chat-roulette-python | Author: ph4r05
def similarities(self):
        """
        Compute Levenshtein distance matrix between files (implemented in C++ pip package: editdistance)
        Later: https://docs.python.org/2/library/difflib.html
        :return:
        """

        ucos = sorted(self.filedb.keys())
        sims = {}

        for idx, uco in enumerate(ucos):
            logger.info('Comparing %s...' % uco)
            sims[uco] = {}

            for idx2, uco2 in enumerate(ucos[idx+1:]):
                dist = editdistance.eval(self.file_data[uco], self.file_data[uco2])
                sims[uco][uco2] = dist
                logger.info(' %6d vs %6d : %4d  %s  %s' % (uco, uco2, dist, self.filedb[uco], self.filedb[uco2]))
Project: event-cui-transfer | Author: mit-ddig
def best_match(word, corrected_med_list, corrected_english_list):
    min_dist_med = len(word)
    best_med_word = ''
    min_dist_eng = len(word)
    best_eng_word = ''
    for word_t in corrected_med_list:
        if editdistance.eval(word, word_t) < min_dist_med:
            min_dist_med = editdistance.eval(word, word_t)
            best_med_word = word_t

    for word_t in corrected_english_list:
        if editdistance.eval(word, word_t) < min_dist_eng:
            min_dist_eng = editdistance.eval(word, word_t)
            best_eng_word = word_t
    if min_dist_med <= min_dist_eng:
        return best_med_word
    else:
        return best_eng_word
Project: keras | Author: NVIDIA
Same show_edit_distance implementation as the keras (GeekLiB) example above.
Project: ws-backend-community | Author: lavalamp-
def compare_strings_by_edit_distance(first=None, second=None):
        """
        Get the edit distance between the two strings passed to this method.
        :param first: The first string to compare.
        :param second: The second string to compare.
        :return: A number representing the edit distance between the two strings passed
        as arguments to this method.
        """
        return editdistance.eval(first, second)

    # Class Methods

    # Public Methods

    # Protected Methods

    # Private Methods

    # Properties

    # Representation and Comparison
Project: keras-101 | Author: burness
Same show_edit_distance implementation as the keras example above.
Project: rebuild_obfuscator | Author: irobert-tluo
def simscore(a1, b1):
        max_len = max([len(a1), len(b1)])
        if max_len == 0:
            return 0
        dist = editdistance.eval(a1, b1)
        if dist > max_len:
            print(dist)
        return 1.0 - (float(dist)/float(max_len))
Project: rebuild_obfuscator | Author: irobert-tluo
def similarity(a1, b1):
  max_len = max([len(a1), len(b1)])
  if max_len == 0:
      return 0
  dist = editdistance.eval(a1, b1)
  return 1.0 - (float(dist)/float(max_len))
Project: speechless | Author: JuliusKunze
def letter_error_count(self) -> float:
        return editdistance.eval(self.expected, self.predicted)
Project: speechless | Author: JuliusKunze
def word_error_count(self) -> float:
        return editdistance.eval(self.expected_words, self.predicted.split())
Project: DeepLearning-OCR | Author: xingjian-f
def edit_dis(a, b):
    return editdistance.eval(a, b)
Project: pe | Author: anguelos
def getEditdistanceMat(gtTranscriptions,sampleTranscriptions):
    outputShape=[len(gtTranscriptions),len(sampleTranscriptions)]
    distMat=np.empty(outputShape)
    maxSizeMat=np.empty(outputShape)
    for gtNum in range(len(gtTranscriptions)):
        for sampleNum in range(len(sampleTranscriptions)):
            distMat[gtNum,sampleNum]=editdistance.eval(gtTranscriptions[gtNum],sampleTranscriptions[sampleNum])
            maxSizeMat[gtNum,sampleNum]=max(len(gtTranscriptions[gtNum]),len(sampleTranscriptions[sampleNum]))
    return distMat/maxSizeMat,distMat
Project: json-merger | Author: inveniosoftware-contrib
def _normalized_edit_dist(s1, s2):
    return float(editdistance.eval(s1, s2)) / max(len(s1), len(s2), 1)
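As a quick illustration of the normalization (values worked by hand), the max(len(s1), len(s2), 1) guard also keeps two empty strings from dividing by zero:

assert _normalized_edit_dist("kitten", "sitting") == 3 / 7  # ≈ 0.429
assert _normalized_edit_dist("", "") == 0.0                 # guard avoids ZeroDivisionError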
Project: Library-Identification | Author: Riscure
def compare_cc_list_levenshtein(sample, ref):
    """
    Compares the cyclomatic complexity values of all functions in `sample`
    with those of all functions in `ref`, by taking the Levenshtein distance
    between these lists. This detects added/removed functions and functions
    that have changed in complexity between a sample and a reference.
    """
    if hasattr(ref, 'cclist') and ref.cclist is not None:
        ratio = 1 - (editdistance.eval(sample.cclist, ref.cclist)
                    / float(max(len(sample.cclist), len(ref.cclist))))
    else:
        ratio = 0.0

    return (ratio * 100, ref.name, ref.version)
Project: pandora | Author: mikekestemont
def annotate(self, tokens):
        X_focus = self.preprocessor.transform(tokens=tokens)['X_focus']
        X_context = self.pretrainer.transform(tokens=tokens)

        # get predictions:
        new_in = {}
        if self.include_token:
            new_in['focus_in'] = X_focus
        if self.include_context:
            new_in['context_in'] = X_context
        preds = self.model.predict(new_in)

        if isinstance(preds, np.ndarray):
            preds = [preds]

        annotation_dict = {'tokens': tokens}
        if self.include_lemma:
            pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=preds[self.lemma_out_idx])
            annotation_dict['lemmas'] = pred_lemmas
            if self.postcorrect:
                for i in range(len(pred_lemmas)):
                    if pred_lemmas[i] not in self.known_lemmas:
                        pred_lemmas[i] = min(self.known_lemmas,
                                            key=lambda x: editdistance.eval(x, pred_lemmas[i]))
                annotation_dict['postcorrect_lemmas'] = pred_lemmas

        if self.include_pos:
            pred_pos = self.preprocessor.inverse_transform_pos(predictions=preds[self.pos_out_idx])
            annotation_dict['pos'] = pred_pos

        if self.include_morph:
            pred_morph = self.preprocessor.inverse_transform_morph(predictions=preds[self.morph_out_idx])
            annotation_dict['morph'] = pred_morph

        return annotation_dict
Project: WebMan | Author: flipflop97
def searchPackages(name):
    results = loadJson('https://www.archlinux.org/packages/search/json/?q=%s' % name)['results']
    results = sorted(results, key=lambda x: levdist(name, x['pkgname']))[:100]
    packages = [parsePackage(package, name) for package in results if package['arch'] in (arch, 'any')]

    results = loadJson('https://aur.archlinux.org/rpc/?v=5&type=search&arg=%s' % name)['results']
    results = sorted(results, key=lambda x: levdist(name, x['Name']))[:100]
    packages += [parsePackage(package, name) for package in results]

    packages = sorted(packages, key=lambda x: x[0])[:100]
    return packages
Project: atropos | Author: jdidion
def set_trimming(self, u, t, use_edit_distance=True):
        untrimmed = u.query_sequence.upper()
        untrimmed_len = len(untrimmed)
        trimmed = t.query_sequence.upper()
        trimmed_len = len(trimmed)

        trimmed_front = 0 if use_edit_distance else -1
        if use_edit_distance and (untrimmed_len > trimmed_len):
            for i in range(untrimmed_len - trimmed_len + 1):
                if untrimmed[i:(i+trimmed_len)] == trimmed:
                    trimmed_front = i
                    break
            else:
                # Since Skewer performs automatic error correction, the trimmed and
                # untrimmed reads may not match, so in that case we find the closest
                # match by Levenshtein distance.
                dist = None
                for i in range(untrimmed_len - trimmed_len + 1):
                    d = editdistance.eval(untrimmed[i:(i+trimmed_len)], trimmed)
                    if not dist:
                        dist = d
                    elif d < dist:
                        trimmed_front = i
                        dist = d

        self.trimmed_front = trimmed_front
        self.trimmed_back = untrimmed_len - (trimmed_len + trimmed_front)
Project: sequtils | Author: atgtag
def edit(seq1, seq2):
    """
    Wrapper around editdistance.eval for fast Levenshtein
    distance computation.

    Args:
        seq1 (str): Reference sequence
        seq2 (str): Sequence to compare

    Examples:
        >>> edit('banana', 'bahama')
        2
    """
    return int(ed.eval(seq1, seq2))
Project: kaggle | Author: rbauld
def edit_distance(train_in, test_in, qcolumns = ['question1', 'question2'], append=''):

    train = train_in.copy().loc[:,qcolumns]
    test = test_in.copy().loc[:,qcolumns]

    import editdistance

    def my_fun(row, qcolumns):
        return editdistance.eval(row[qcolumns[0]], row[qcolumns[1]])

    key = 'edit_dist'+append
    train[key] = train.apply(lambda x: my_fun(x, qcolumns=qcolumns), axis=1)
    test[key]  = test.apply(lambda x: my_fun(x, qcolumns=qcolumns), axis=1)

    return (train, test)
Project: social-vuln-scanner | Author: Betawolf
def bestNameDiff(profileone, profiletwo):
    """ Applies Levenshtein distance between best names of two profiles."""
    n1 = profileone.bestname()
    n2 = profiletwo.bestname()
    if (not n1) or (not n2):
      return 0
    l1 = profileone.name_length
    l2 = profiletwo.name_length
    diff = editdistance.eval(n1,n2)
    return 1-(diff/(l1 if l1 > l2 else l2))
Project: social-vuln-scanner | Author: Betawolf
def string_sim(n1, n2):
    """ Applies Levenshtein distance between strings."""
    if (not n1) or (not n2):
      return 0
    l1 = len(n1)
    l2 = len(n2)
    diff = editdistance.eval(n1,n2)
    return 1-(diff/(l1 if l1 > l2 else l2))
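Worked by hand: editdistance.eval("kitten", "sitting") is 3 and the longer string has 7 characters, so:

string_sim("kitten", "sitting")  # 1 - 3/7 ≈ 0.571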
Project: Mandalorion | Author: christopher-vollmers
def collect_file_paths(path,gene_file):
   genes_of_interest=[]
   for line in open(gene_file):
       genes_of_interest.append(line.strip())

   isoform_list=[]
   gene_read_counter={}
   isoform_read_counter={}
   for gene in genes_of_interest:
       gene_read_counter[gene]=0
       for file1 in sorted(os.listdir(path+'/parsed_reads')):
           if gene in file1:

               file2=file1+'_sub'
               out_sub=open(path+'/parsed_reads/'+file2,'w') 
               counter=0
               isoform_reads=read_fasta(path+'/parsed_reads/'+file1)
               isoform_read_list=list(isoform_reads.keys())
               print(gene_read_counter,gene_read_counter[gene],len(isoform_reads.keys()))
               gene_read_counter[gene]+=len(isoform_reads.keys())
               isoform_read_counter[path+'/parsed_reads/'+file2]=len(isoform_reads.keys())
               read1 = isoform_read_list[0]
               out_sub.write('>'+read1+'\n'+isoform_reads[read1]+'\n')
               for read2 in isoform_read_list[1::]:
                   if counter<subsample:
                       out_sub.write('>'+read2+'\n')
                       dist_1 = editdistance.eval(isoform_reads[read1],isoform_reads[read2])**2/float(len(isoform_reads[read1])*len(isoform_reads[read2]))
                       dist_2 = editdistance.eval(isoform_reads[read1],reverse_complement(isoform_reads[read2]))**2/float(len(isoform_reads[read1])*len(isoform_reads[read2]))
                       if dist_1 < dist_2:
                           out_sub.write(isoform_reads[read2]+'\n')
                       else:
                           out_sub.write(reverse_complement(isoform_reads[read2])+'\n')
                   counter+=1


               isoform_list.append((path+'/parsed_reads/'+file2,gene))

   return isoform_list,gene_read_counter,isoform_read_counter
Project: wub | Author: nanoporetech
def test_simulate_sequencing_errors(self):
        """Test function simulating sequencing errors."""
        error_rate = 0.1
        error_weights = {'substitution': 1.0 / 6,
                         'insertion': 1.0 / 6,
                         'deletion': 4.0 / 6}
        sequence = sim_seq.simulate_sequence(5000)
        mutated_record = sim_seq.simulate_sequencing_errors(
            sequence, error_rate, error_weights)
        distance = editdistance.eval(sequence, mutated_record.seq)
        expected_errors = len(sequence) * error_rate
        errors_sd = np.sqrt(len(sequence) * error_rate * (1 - error_rate))
        # Should pass 0.9973 proportion of cases:
        self.assertTrue(expected_errors - errors_sd * 3 < distance < expected_errors +
                        errors_sd * 3, msg="expected: {} realised:{}".format(expected_errors, distance))
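The 3-sigma window in this test follows from modelling the error count as binomial: with n bases and error probability p, the expectation is n*p and the standard deviation is sqrt(n*p*(1-p)). Worked by hand for the values above:

n, p = 5000, 0.1
expected = n * p                  # 500
sd = (n * p * (1 - p)) ** 0.5     # ≈ 21.2
# accepted window: expected ± 3*sd ≈ 436.4 .. 563.6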
Project: OCkRE | Author: rossumai
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        wrong = 0
        right = 0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc],
                                       word_batch['labeltype_input'][0:num_proc])
            for j in range(0, num_proc):
                ocr_result = deaccent(unicode(re.sub("[\+\/]", "", re.sub("\\s", "", decoded_res[j])), 'utf-8'))
                gold_label = re.sub("[\+\/]", "", word_batch['source_str'][j])
                if gold_label == ocr_result:
                    right += 1
                else:
                    wrong += 1
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        absacc = float(right) / (float(right) + float(wrong))
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        outline = ' Out of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f\n Absolute accuracy over labels is %0.2f\n' % (
            num, mean_ed, mean_norm_ed, absacc)
        print(outline)

        return mean_norm_ed, absacc
Project: rctw17 | Author: bgshih
def text_distance(str1, str2):
  str1 = normalize_txt(str1)
  str2 = normalize_txt(str2)
  return editdistance.eval(str1, str2)
Project: speechT | Author: timediv
def track_decoding(self, decoded_str, expected_str):
    self.letter_edit_distance = editdistance.eval(expected_str, decoded_str)
    self.letter_error_rate = self.letter_edit_distance / len(expected_str)
    self.word_edit_distance = editdistance.eval(expected_str.split(), decoded_str.split())
    self.word_error_rate = self.word_edit_distance / len(expected_str.split())
    self.sum_letter_edit_distance += self.letter_edit_distance
    self.sum_letter_error_rate += self.letter_error_rate
    self.sum_word_edit_distance += self.word_edit_distance
    self.sum_word_error_rate += self.word_error_rate
    self.decodings_counter += 1
Project: speechT | Author: timediv
def run_step(self, model: SpeechModel, sess: tf.Session, stats: EvalStatistics,
               save: bool, verbose=True, feed_dict: Dict=None):
    global_step = model.global_step.eval()

    # Validate on data set and write summary
    if save:
      avg_loss, decoded, label, summary = model.step(sess, update=False, decode=True, return_label=True,
                                                     summary=True, feed_dict=feed_dict)
      model.summary_writer.add_summary(summary, global_step)
    else:
      avg_loss, decoded, label = model.step(sess, update=False, decode=True,
                                            return_label=True, feed_dict=feed_dict)

    if verbose:
      perplexity = np.exp(float(avg_loss)) if avg_loss < 300 else float("inf")
      print("validation average loss {:.2f} perplexity {:.2f}".format(avg_loss, perplexity))

    # Print decode
    decoded_ids_paths = [Evaluation.extract_decoded_ids(path) for path in decoded]
    for label_ids in Evaluation.extract_decoded_ids(label):
      expected_str = speecht.vocabulary.ids_to_sentence(label_ids)
      if verbose:
        print('expected: {}'.format(expected_str))
      for decoded_path in decoded_ids_paths:
        decoded_ids = next(decoded_path)
        decoded_str = speecht.vocabulary.ids_to_sentence(decoded_ids)
        stats.track_decoding(decoded_str, expected_str)
        if verbose:
          print('decoded: {}'.format(decoded_str))
          print('LED: {} LER: {:.2f} WED: {} WER: {:.2f}'.format(stats.letter_edit_distance,
                                                                 stats.letter_error_rate,
                                                                 stats.word_edit_distance,
                                                                 stats.word_error_rate))
Project: inflation_calc | Author: EricSchles
def closest(self, date=datetime.date.today(), country=None,
                limit=datetime.timedelta(days=366)):
        """
        Get the closest CPI value for a specified date. The date defaults to
        today. A limit can be provided to exclude all values for dates further
        away than defined by the limit. This defaults to 366 days.
        """

        # Try to get the country
        try:
            self.data[country]
            possible_countries = [country]
        except KeyError:
            possible_countries = [elem for elem in self.data.keys() if editdistance.eval(country, elem) < 3]
            if len(possible_countries) == 0:
                return "No country found, typo unlikely for ", country

        # Find the closest date
        country_cpi = {}
        for country in possible_countries:
            min_year_diff = 1000
            min_year = 0
            for year in self.data[country]:
                if min_year_diff > abs(date.year - int(year)):
                    min_year_diff = abs(date.year - int(year))
                    min_year = year
            country_cpi[country] = self.data[country][min_year]
        if len(country_cpi) == 1:
            return country_cpi[list(country_cpi.keys())[0]]
        else:
            return country_cpi
Project: speech | Author: awni
def compute_cer(results):
    """
    Arguments:
        results (list): list of ground truth and
            predicted sequence pairs.

    Returns the CER for the full set.
    """
    dist = sum(editdistance.eval(label, pred)
                for label, pred in results)
    total = sum(len(label) for label, _ in results)
    return dist / total
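A hypothetical input and the resulting CER, worked by hand:

results = [("hello", "helo"), ("world", "world")]
assert compute_cer(results) == (1 + 0) / (5 + 5)  # 0.1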
Project: tensorflow-quorakaggle | Author: ram1988
def __evaluateLevensteindistance(self, question1, question2):
        leven_dis = levendis.eval(question1.lower(), question2.lower())
        return leven_dis
Project: panphon | Author: dmort27
def fast_levenshtein_distance(self, source, target):
        """Wrapper for the distance function in the Levenshtein module

        Args:
            source (unicode): source word
            target (unicode): target word

        Returns:
            int: minimum number of Levenshtein edits required to get from
                 `source` to `target`
        """
        return int(editdistance.eval(source, target))
Project: panphon | Author: dmort27
def fast_levenshtein_distance_div_maxlen(self, source, target):
        """Levenshtein distance divided by maxlen

        Args:
            source (unicode): source word
            target (unicode): target word

        Returns:
            int: minimum number of Levenshtein edits required to get from
                 `source` to `target` divided by the length of the longest
                 of these arguments
        """
        maxlen = max(len(source), len(target))
        return int(editdistance.eval(source, target)) / maxlen
Project: agrigento | Author: ucsb-seclab
def calc_score(value, values):
    distance = 1000000000
    for v in values:
        if len(value) == len(v):
            d = bit_edit_distance(value, v)
        else:
            d = editdistance.eval(value, v) * 8
        distance = min(distance, d)

    return distance
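The helper bit_edit_distance is not defined in this snippet. Judging from the byte-level branch being scaled by 8 bits per byte, a plausible definition, assumed here purely for illustration, is a bitwise Hamming distance over equal-length values:

def bit_edit_distance(a: bytes, b: bytes) -> int:
    # Assumed helper (not from the agrigento snippet): count differing bits
    # between two equal-length byte strings.
    return sum(bin(x ^ y).count("1") for x, y in zip(a, b))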
Project: attention_ocr | Author: lightcaster
def batched_wer(ref, hyp):
    ''' Computes mean WER 

    ref: list of references
    hyp: list of corresponding hypotheses

    '''
    assert len(ref) == len(hyp)

    wer = 0.
    for r,f in zip(ref, hyp):
        rate = editdistance.eval(r, f) / len(r)
        wer += rate

    return wer/len(ref)
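A small illustrative call (the token lists are hypothetical); editdistance.eval accepts word lists directly:

refs = [["the", "cat", "sat"], ["a", "dog"]]
hyps = [["the", "bat", "sat"], ["a", "dog"]]
batched_wer(refs, hyps)  # (1/3 + 0/2) / 2 ≈ 0.167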
Project: dnnQuery | Author: richardxiong
def strSimilarity(word1, word2):
    ''' Measure the similarity based on Edit distance
    ### Measure how similar word1 is with respect to word2
    '''
    diff = ed.eval(word1.lower(), word2.lower())   #search
    # lcs = LCS(word1,word2)   #search
    length = max(len(word1), len(word2))
    if diff >= length:
        similarity = 0.0
    else:
        similarity = 1.0 * (length-diff) / length
    return similarity
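For example (worked by hand): diff is 1 for "table" vs. "tables" and the longer word has 6 characters, so:

strSimilarity("table", "tables")  # 1.0 * (6 - 1) / 6 ≈ 0.833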
Project: pe | Author: anguelos
def getFSNSMetrics(gtIdTransDict,methodIdTransDict):
    """Provides metrics for the FSNS dataset.
    FM, precision, recall and correctSequences are an implementation of the metrics described in
    "End-to-End Interpretation of the French Street Name Signs Dataset"
    [https://link.springer.com/chapter/10.1007%2F978-3-319-46604-0_30]
    Params:
        gtIdTransDict : sample_id to data dictionary. A simple file name to file contents might do.
        methodIdTransDict : sample_id to data dictionary. A simple file name to file contents might do.

    Returns:
        A tuple of floats between 0 and 1 with all worth-reporting measurements:
        FM, precision, recall, globally correct word transcriptions. If someone returned
        "rue" as the transcription of every image, assuming half the images have it, he
        would get a precision of 50%, a recall of ~5% and an FM of ~9.1%.
        He would get a correctSequences score of 0%, and a similarity of e%.
    """
    def compareTexts(sampleTxt,gtTxt):
        relevant=gtTxt.lower().split()
        retrieved=sampleTxt.lower().split()
        correct=(set(relevant).intersection(set(retrieved)))
        similarity=1.0/(1+editdistance.eval(gtTxt.lower(),sampleTxt.lower()))
        res=(len(correct),len(relevant),len(retrieved),relevant==retrieved,similarity)
        return res
    mDict={k:'' for k in gtIdTransDict.keys()}
    mDict.update(methodIdTransDict)
    methodIdTransDict=mDict
    methodKeys=sorted(methodIdTransDict.keys())
    gtKeys=sorted(gtIdTransDict.keys())
    if len(methodKeys)!= len(set(methodKeys))  or len(gtKeys)!= len(set(gtKeys)) or len(set(methodKeys)-set(gtKeys))>0 :  # gt and method disagree on samples
        sys.stderr.write("GT and submission disagree on the sample ids\n")
        sys.exit(1)
    corectRelevantRetrievedSimilarity=np.zeros([len(gtKeys),5],dtype='float32')
    for k in range(len(gtKeys)):
        sId=gtKeys[k]
        corectRelevantRetrievedSimilarity[k,:]=compareTexts(methodIdTransDict[sId],gtIdTransDict[sId])
    precision=(corectRelevantRetrievedSimilarity[:,0].sum()/(corectRelevantRetrievedSimilarity[:,1].sum()))
    recall=(corectRelevantRetrievedSimilarity[:,0].sum()/(corectRelevantRetrievedSimilarity[:,2].sum()))
    FM=(2*precision*recall)/(precision+recall)
    correctSequences=corectRelevantRetrievedSimilarity[:,3].mean()
    similarity=corectRelevantRetrievedSimilarity[:,4].mean()
    combinedSoftMetric=(1-FM)*FM+FM*similarity  # The better FM is, the less it matters in the overall score
    return combinedSoftMetric,FM,precision,recall,similarity,correctSequences,corectRelevantRetrievedSimilarity
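The per-sample similarity used above is 1 / (1 + d), where d is the edit distance of the lowercased texts: identical strings score 1.0 and the score decays toward 0 as d grows. A worked example (the strings are illustrative):

import editdistance
d = editdistance.eval("rue de la paix", "rue de l paix")  # 1
similarity = 1.0 / (1 + d)                                # 0.5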
Project: markov-sentence-correction | Author: anassinator
def _correct(observed_sentence, bigrams, distribution, max_error_rate):
    """Corrects a given sentence.

    Note: The lower the max_error_rate, the faster the algorithm, but the
          likelier it will fail.

    Args:
        observed_sentence: Observed sentence.
        bigrams: First-order Markov chain of likely word sequences.
        distribution: Error probability distribution function.
        max_error_rate: Maximum number of errors in a word to consider.

    Returns:
        Ordered list of tuples of (corrected sentence, its probability).
        Most likely interpretations come first.
    """
    trellis = [{Sentence.START: (1.0, None)}]

    observed_words = list(observed_sentence)
    number_of_words = len(observed_words)

    for k in range(1, number_of_words):
        observed_word = observed_words[k]
        max_errors = int(len(observed_word) * max_error_rate) + 1

        current_states = {}
        previous_states = trellis[k - 1]
        trellis.append(current_states)

        for previous_word in previous_states:
            previous_prob = previous_states[previous_word][0]

            future_states = bigrams.yield_future_states((previous_word,))
            for possible_word, conditional_prob in future_states:
                # Conditional probability: P(X_k | X_k-1) * previous
                # probability.
                total_prob = conditional_prob * previous_prob

                # Emission probability: P(E_k | X_k).
                distance = editdistance.eval(observed_word, possible_word)
                total_prob *= distribution(distance)

                # Ignore states that have too many mistakes.
                if distance > max_errors:
                    continue

                # Only keep link of max probability.
                if possible_word in current_states:
                    if current_states[possible_word][0] >= total_prob:
                        continue

                current_states[possible_word] = (total_prob, previous_word)

    # Find most likely ending.
    interpretations = list(_backtrack_path(trellis, x) for x in trellis[-1])
    interpretations.sort(key=lambda x: x[1], reverse=True)

    return interpretations
Project: pandora | Author: mikekestemont
def test(self, multilabel_threshold=0.5):
        if not self.include_test:
            raise ValueError('Please do not call .test() if no test data is available.')

        score_dict = {}

        # get test predictions:
        test_in = {}
        if self.include_token:
            test_in['focus_in'] = self.test_X_focus
        if self.include_context:
            test_in['context_in'] = self.test_contexts

        test_preds = self.model.predict(test_in,
                                batch_size=self.batch_size)

        if isinstance(test_preds, np.ndarray):
            test_preds = [test_preds]

        if self.include_lemma:
            print('::: Test scores (lemmas) :::')

            pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=test_preds[self.lemma_out_idx])
            if self.postcorrect:
                for i in range(len(pred_lemmas)):
                    if pred_lemmas[i] not in self.known_lemmas:
                        pred_lemmas[i] = min(self.known_lemmas,
                                        key=lambda x: editdistance.eval(x, pred_lemmas[i]))
            score_dict['test_lemma'] = evaluation.single_label_accuracies(gold=self.test_lemmas,
                                                 silver=pred_lemmas,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)

        if self.include_pos:
            print('::: Test scores (pos) :::')
            pred_pos = self.preprocessor.inverse_transform_pos(predictions=test_preds[self.pos_out_idx])
            score_dict['test_pos'] = evaluation.single_label_accuracies(gold=self.test_pos,
                                                 silver=pred_pos,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)

        if self.include_morph:     
            print('::: Test scores (morph) :::')
            pred_morph = self.preprocessor.inverse_transform_morph(predictions=test_preds[self.morph_out_idx],
                                                                   threshold=multilabel_threshold)
            if self.include_morph == 'label':
                score_dict['test_morph'] = evaluation.single_label_accuracies(gold=self.test_morph,
                                                 silver=pred_morph,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)
            elif self.include_morph == 'multilabel':
                score_dict['test_morph'] = evaluation.multilabel_accuracies(gold=self.test_morph,
                                                 silver=pred_morph,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)
        return score_dict
Project: handelsregister | Author: Amsterdam
def fix_ambiguous(ambiguous_sbi):
    """
    For each ambiguous sbi code find the most likely candidate

     0       vs.id,
     1       vs.naam,
     2       codes.hr_code,
     3       codes.alt_code,
     4       codes.title,
     5       codes.alt_title,
     6       codes.sub_cat,
     7       codes.alt_sub_cat,
     8       codes.mks_title

    """
    original_count = 0
    suggestion_count = 0

    for row in ambiguous_sbi:

        normalcode = row[2]
        zerocode = row[3]

        desc1 = row[4]
        desc2 = row[5]
        original = row[8]

        distance_desc1 = editdistance.eval(desc1, original)
        distance_desc2 = editdistance.eval(desc2, original)

        if distance_desc1 > distance_desc2:
            # the alternative match with 0 is better
            suggestion_count += 1
            ves = hrmodels.Vestiging.objects.get(id=row[0])
            invalid_activiteit = ves.activiteiten.get(sbi_code=normalcode)
            # fix the code
            invalid_activiteit.sbi_code = zerocode
            # save the corrected sbi code
            invalid_activiteit.save()
            # Now save updated code
        else:
            # do nothing default is fine
            original_count += 1

        log.debug(f'{normalcode}, {zerocode}, {desc1[:18]}, {desc2[:18]}, {original[:18]}, {distance_desc1}, {distance_desc2}')  # noqa

    log.debug("%s-%s = Original-Suggestion", original_count, suggestion_count)
