使用python从文本文件提取内容时出现问题

def find_no_line_start_table(table_title, splited_data):
    """Return the index of every line in ``splited_data`` that contains ``table_title``."""
    return [index for index, line in enumerate(splited_data) if table_title in line]


def get_start_data_table(table_start, splited_data):
    """Return the index of the first line at or after ``table_start`` containing 'Dollars'.

    Returns None when no such line exists.
    """
    for index, line in enumerate(splited_data[table_start:]):
        if 'Dollars' in line:
            return table_start + index
    return None


def get_end_table(start_table_data, splited_data):
    """Return the index of the first line at or after ``start_table_data`` that
    contains the module-level ``END_TABLE_LINE`` marker.

    Returns None when no such line exists.
    """
    for index, line in enumerate(splited_data[start_table_data:]):
        if END_TABLE_LINE in line:
            return start_table_data + index
    return None


def row(l):
    """Split one text line into a list of six cells.

    The line is split on whitespace. Words up to and including the first word
    that contains ':' are joined (space separated, with a leading space) into
    cell 0; each following word fills the next cell.

    Returns None when the line has fewer than six words.

    NOTE: there is no upper bound on the cell index, so a line with more than
    five words after the first word containing ':' raises IndexError.
    """
    words = l.split()
    number_columns = 6
    if len(words) < number_columns:
        return None

    data_row = [''] * number_columns
    first_column_done = False
    index = 0
    for w in words:
        if not first_column_done:
            data_row[0] = ' '.join([data_row[0], w])
            if ':' in w:
                first_column_done = True
        else:
            index += 1
            data_row[index] = w
    return data_row


def take_table(txt_data):
    """Collect the parsed rows of the table into a dict of column lists.

    NOTE: the lines are read from the module-level ``table``; the ``txt_data``
    argument is accepted but not read.

    Lines for which ``row`` returns a falsy value are skipped. The result maps
    each of the six column names to the list of that column's cells, including
    the sixth column 'p'.
    """
    keys = ('comodity', 'q', 'w', 'e', 't', 'p')
    columns = {key: [] for key in keys}
    for r in table:
        data_row = row(r)
        if data_row:
            for key, cell in zip(keys, data_row):
                columns[key].append(cell)
    return columns

import pandas as pd
import requests

# Download the report as plain text.
response = requests.get(
    "https://downloads.usda.library.cornell.edu/usda-esmis/files/c821gj76b/6w924d00c/9z903130m/AgriPric-07-30-2010.txt",
    timeout=30,
)
# Stop on an HTTP error status instead of parsing an error page as the report.
response.raise_for_status()
txt_data = response.text
splited_data = txt_data.split('\n')

table_title = 'Prices Received, United States'
END_TABLE_LINE = '-------------------------------------------'

# This unpacking requires exactly three lines containing the title
# (ValueError otherwise); the second occurrence is the one used.
_, table_start, _ = find_no_line_start_table(table_title, splited_data)
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line:end_line]

# take_table reads the module-level ``table`` defined just above.
dict_table = take_table(txt_data)
c = pd.DataFrame(dict_table)

错误原因:

data_row 是一个包含 6 个元素的列表。

number_columns = 6
# ...
    data_row = [''] * number_columns  # [''] * 6

而在 first_column_done = True 之后,index 会在每次迭代中递增。first_column_done 则会在某个单词中遇到 : 时变为 True,即

if ':' in w:
    first_column_done = True

因此,在 first_column_done 变为 True 之后的每次迭代中,index 都会递增,直到超出列表 data_row 的范围(该列表只有 6 个元素,最大有效索引为 5)。

def row(l):
    """Failing version of the line parser, quoted to show where the IndexError is raised."""
    l = l.split()
    number_columns = 6
    if len(l) >= number_columns: 
        # data_row has exactly 6 slots, so the valid indices are 0..5.
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if not first_column_done:
                # Words are joined into cell 0 until one containing ':' is seen.
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
            else:
                # Every later word advances index; nothing caps it at 5.
                index += 1
                data_row[index] = w    # error pos.: IndexError once index reaches 6

换言之,只要某一行中第一个包含 : 的单词之后,剩余的单词数超过 data_row 中剩余的可用位置(即多于 5 个),就会出现这个错误。

修复:

使用 split(':')、列表推导式(list comprehension)以及 Python 的三元运算符(ternary operator)。

def row(l, number_columns=6):
    """Split one table line into exactly ``number_columns`` cells.

    The line is split on ':'. The first two fields are kept whole (stripped of
    surrounding whitespace) and the third field is split on whitespace into
    one cell per word; any fields after the third are ignored. The result is
    padded with '' or truncated so it always has ``number_columns`` items.

    Returns None when the line has fewer than two ':' separators, so that
    blank, title and ruler lines can be skipped by the caller instead of
    raising IndexError.
    """
    fields = [col.strip() for col in l.split(':')]
    if len(fields) < 3:
        return None
    cells = fields[:2] + fields[2].split()
    return [cells[i] if i < len(cells) else '' for i in range(number_columns)]