如何在Python中修复这个n-gram提取器?

def org_ngram(classified_text):
    """Merge adjacent ORGANIZATION tokens into lower-cased organization names.

    Args:
        classified_text: iterable of NER-tagged tokens whose first item is
            the token text and whose second item is the NER label,
            e.g. ``[("Nike", "ORGANIZATION"), ("Inc", "ORGANIZATION")]``.

    Returns:
        A list of strings, one per maximal run of consecutive tokens
        labelled ``"ORGANIZATION"``; the tokens of a run are lower-cased and
        joined with single spaces. Empty when no token carries that label.
    """
    combined_orgs = []
    current_run = []  # lower-cased tokens of the organization being built
    for tagged_token in classified_text:
        token, label = tagged_token[0], tagged_token[1]
        if label == "ORGANIZATION":
            current_run.append(token.lower())
        elif current_run:
            # A token with any other label ends the current organization.
            combined_orgs.append(" ".join(current_run))
            current_run = []
    if current_run:
        # Flush an organization that runs to the end of the input.
        combined_orgs.append(" ".join(current_run))
    return combined_orgs

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Both paths are placeholders for the local Stanford NER model and jar.
st = StanfordNERTagger(
    'C:\\path\\english.all.3class.distsim.crf.ser.gz',
    'C:\\Users\\path\\stanford-ner.jar',
    encoding='utf-8',
)

text = "Trump met with representatives from Sprint International Corporation, Nike Inc, and Wal-Mart Company regarding the trade war."

# Tokenize, NER-tag each token, then collect the organization names.
tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)
orgs = org_ngram(classified_text)
print(orgs)

首先,请避免使用 StanfordNERTagger,它很快就会被弃用。请改用 CoreNLPParser(参见 “Stanford Parser and NLTK”):

>>> from nltk.parse import CoreNLPParser

# Lexical Parser
>>> parser = CoreNLPParser(url='http://localhost:9000')

>>> ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
>>> list(ner_tagger.tag(('Rami Eid is studying at Stony Brook University in NY'.split())))
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]

https://stackoverflow.com/a/30666949/610569

from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

def stanfordNE2BIO(tagged_sent):
    """Convert plain NER labels into BIO-prefixed labels.

    Args:
        tagged_sent: iterable of ``(token, label)`` pairs, where ``"O"``
            marks a token outside any named entity.

    Returns:
        A list of ``(token, bio_label)`` pairs. ``"O"`` labels are kept
        unchanged; any other label gets ``"I-"`` when the preceding token
        carried the same label and ``"B-"`` otherwise.
    """
    converted = []
    last_label = "O"
    for word, label in tagged_sent:
        if label == "O":
            bio_label = label
        elif label == last_label:
            # Same entity label as the previous token: continue the entity.
            bio_label = "I-" + label
        else:
            # Entity starts after an "O" token or right after a different
            # entity label.
            bio_label = "B-" + label
        converted.append((word, bio_label))
        last_label = label
    return converted


def stanfordNE2tree(ne_tagged_sent):
    """Build a chunk tree from an NER-tagged sentence.

    The labels are first converted to BIO form with ``stanfordNE2BIO``,
    the tokens are POS-tagged with ``pos_tag``, and the resulting
    ``(token, pos, bio_label)`` triples are passed to ``conlltags2tree``.

    Args:
        ne_tagged_sent: iterable of ``(token, ner_label)`` pairs.

    Returns:
        The tree produced by ``conlltags2tree`` for the sentence.
    """
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    if not bio_tagged_sent:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError; hand the empty sentence to conlltags2tree as is.
        return conlltags2tree([])

    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for _, pos in pos_tag(sent_tokens)]

    sent_conlltags = list(zip(sent_tokens, sent_pos_tags, sent_ne_tags))
    return conlltags2tree(sent_conlltags)

def extract_ner(ne_tagged_sent):
    """Extract the named entities of an NER-tagged sentence.

    Args:
        ne_tagged_sent: iterable of ``(token, ner_label)`` pairs.

    Returns:
        A list of ``(entity_string, entity_label)`` pairs in sentence order,
        where ``entity_string`` is the entity's tokens joined by single
        spaces and ``entity_label`` is the label of its chunk in the tree
        built by ``stanfordNE2tree``.
    """
    ne_tree = stanfordNE2tree(ne_tagged_sent)

    # Only named-entity chunks are Tree nodes; children that are not Tree
    # instances (tokens outside any entity) are skipped.
    return [
        (" ".join(token for token, pos in chunk.leaves()), chunk.label())
        for chunk in ne_tree
        if isinstance(chunk, Tree)
    ]

然后:

# Sample NER-tagged sentence: one (token, label) pair per token.
ne_tagged_sent = [
    ('Rami', 'PERSON'),
    ('Eid', 'PERSON'),
    ('is', 'O'),
    ('studying', 'O'),
    ('at', 'O'),
    ('Stony', 'ORGANIZATION'),
    ('Brook', 'ORGANIZATION'),
    ('University', 'ORGANIZATION'),
    ('in', 'O'),
    ('NY', 'LOCATION'),
]

print(extract_ner(ne_tagged_sent))

[输出]:

[('Rami Eid', 'PERSON'), ('Stony Brook University', 'ORGANIZATION'), ('NY', 'LOCATION')]