Python社区  »  Python

如何在Python中修复这个n-gram提取器?

Aubrey • 6 月前 • 42 次点击  

我做了一个n-gram提取器,可以从文本中提取组织的名称。然而,程序只提取第一个单词和最后一个单词的第一个字母。例如,如果 "Sprint International Corporation" 出现在文本中,程序将返回 "s corporation" 作为n-克。你知道我做错了什么吗?我已经发布了下面的代码和输出。谢谢。

def org_ngram(classified_text):
    orgs = [c for c in classified_text if (c[1]=="ORGANIZATION")]
    #print(orgs)

    combined_orgs = []
    prev_org = False
    new_org = ("", "ORGANIZATION")
    for i in range(len(classified_text)):
        if classified_text[i][1] != "ORGANIZATION":
            prev_org = False
        else:
            if prev_org:
                new_org = new_org[0] + " " + classified_text[i][0].lower()
            else:
                combined_orgs.append(new_org)
                new_org = classified_text[i][0].lower()
            prev_org = True

    combined_orgs.append(new_org)
    combined_orgs = combined_orgs[1:]
    return combined_orgs

这是我分析的文本和我用来分析它的程序。

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

st = StanfordNERTagger('C:\\path\\english.all.3class.distsim.crf.ser.gz',
                       'C:\\Users\\path\\stanford-ner.jar',
                       encoding='utf-8')

text = "Trump met with representatives from Sprint International Corporation, Nike Inc, and Wal-Mart Company regarding the trade war."

tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)
orgs = org_ngram(classified_text)

print(orgs)

这是电流输出。

['s corporation', 'n inc', 'w company']

这就是我想要输出的样子。

['sprint international corporation', 'nike inc', 'wal-mart company']
Python社区是高质量的Python/Django开发社区
本文地址:http://www.python88.com/topic/56801
 
42 次点击  
分享到微博
文章 [ 1 ]  |  最新文章 6 月前
alvas
Reply   •   1 楼
alvas    7 月前

首先,避免 StanfordNERTagger ,它很快就会被弃用。改用这个 Stanford Parser and NLTK

>>> from nltk.parse import CoreNLPParser

# Lexical Parser
>>> parser = CoreNLPParser(url='http://localhost:9000')

>>> ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
>>> list(ner_tagger.tag(('Rami Eid is studying at Stony Brook University in NY'.split())))
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]

https://stackoverflow.com/a/30666949/610569

from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag

    return bio_tagged_sent


def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree

def extract_ner(ne_tagged_sent):
    ne_tree = stanfordNE2tree(ne_tagged_sent)

    ne_in_sent = []
    for subtree in ne_tree:
        if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    return ne_in_sent

然后:

ne_tagged_sent = [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), 
('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), 
('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), 
('in', 'O'), ('NY', 'LOCATION')]

print(extract_ner(ne_tagged_sent))

[出局]:

[('Rami Eid', 'PERSON'), ('Stony Brook University', 'ORGANIZATION'), ('NY', 'LOCATION')]