Skip to content

DataFrame数据修改常用操作

1、创建DataFrame

python
df = pd.DataFrame(data) 
# 从 CSV 文件读取 
DataFrame df = pd.read_csv('file.csv')
# 推荐使用kpl,可以完整保留行索引与列索引
df_loaded = pd.read_pickle(r'E:\py\df\5555.pkl')

2、选择数据

python
# 选择单列
df['姓名']

# 选择多列  索引列用()
df[['姓名', '城市']]
df[('三年级''1班')]

# 选择行
df.iloc[0]  # 根据索引选择第 0 行

# 条件选择  多个条件用&来串联
df[(df['年龄'] >) 30 & (df['年龄'] < 600)]

3、添加删除列

python
# 添加新列
df['薪资'] = [5000, 6000, 7000]

# 删除列
df.drop('薪资', axis=1, inplace=True)

4、排序与分组求和

python
# 根据某列排序
df.sort_values(by='年龄', ascending=True, inplace=True)

# 分组聚合 平均值  中位数等
grouped = df.groupby('城市').mean()  # 按城市分组并计算平均值

5、数据的横向纵向连接

python
# 合并两个 DataFrame  ,同sql来实现 连接
merged = pd.merge(df1, df2, on='姓名')

# 连接 DataFrame
concat_df = pd.concat([df1, df2], axis=0)  # 纵向连接

6、数据处理

python
# 检查缺失值
df.isnull().sum()
df.isna()

# 删除缺失值
df.dropna(inplace=True)

# 填充缺失值
df.fillna(0, inplace=True)

# 转置 DataFrame
transposed = df.T

7、索引

python
#设置列索引 以及列索引重命名
df = df.set_index(['line_gl_account_type','line_gl_parent_account_name','line_gl_account_name'])
df.index.names = ['Account Type','Parent Account', 'Account']

#重置索引,索引需要唯一不重复
df_loaded = df_loaded.reset_index()

8、pandas.io.formats.style.Styler对象的一些操作

python
#DataFrame执行过df.style.apply就变成了Styler对象,

#添加框线操作
styled_df = styled_df.set_table_styles(
        [{'selector': 'th', 'props': [('border', '1px solid black')]},
         {'selector': 'td', 'props': [('border', '1px solid black')]}])

#左对齐 右对齐操作
styled_df.set_properties(**{'text-align': 'right'})

9、一些无分类操作

python
#添加空白行赋值<br> 就不会被隐藏
empty_rows.iloc[1, df.columns.get_loc(('line_gl_account_type', ''))] = '<br> '

#自定义排序
custom_order = ['Income', 'Cost of Goods Sold', 'Expense', 'Other Income', 'Tax Expense']
df_loaded[('line_gl_account_type', '')] = pd.Categorical(df_loaded[('line_gl_account_type', '')],categories=custom_order, ordered=True)
df_sorted = df_loaded.sort_values(by=[('line_gl_account_type', ''), ('line_gl_parent_account_name', ''), ('groupingid', '')])

#输出到本地html
file_path = r"E:\py\output.html"
styled_df.to_html(file_path)

代码demo

python
import time
import os
import json
import re
import pandas as pd

#   45 Gross Profit
line1 = {"location":{("line_gl_account_type",""): "Cost of Goods Sold"},
        "typpe": "sum",
        "conditions":{("line_gl_account_type",""): ["Income","Cost of Goods Sold"]},
        "alias":{"line_gl_account_type": "Gross Profit","line_gl_parent_account_code": "Gross Profit","line_gl_account_name": "Gross Profit"}}

# 457  Net Ordinary Income
line2 = {"location":{("line_gl_account_type",""): "Expense"},
        "typpe": "sum",
        "conditions":{("line_gl_account_type",""): ["Income","Cost of Goods Sold","Expense"]},
        "alias":{"line_gl_account_type": "Net Ordinary Income","line_gl_parent_account_code": "Net Ordinary Income","line_gl_account_name": "Net Ordinary Income"}}

 # 4578     Income(Loss) Before Taxes
line3 = {"location":{("line_gl_account_type",""): "Other Income"},
        "typpe": "sum",
        "conditions":{("line_gl_account_type",""): ["Income","Cost of Goods Sold","Expense","Other Income"]},
        "alias":{"line_gl_account_type": "Income(Loss) Before Taxes","line_gl_parent_account_code": "Income(Loss) Before Taxes","line_gl_account_name": "Income(Loss) Before Taxes"}}

 # 4578 9    Net Income
line4= {"location":{("line_gl_account_type",""): "Tax Expense"},
        "typpe": "sum",
        "conditions":{("line_gl_account_type",""): ["Income","Cost of Goods Sold","Expense","Other Income","Tax Expense"]},
        "alias":{"line_gl_account_type": "Net Income","line_gl_parent_account_code": "Net Income","line_gl_account_name": "Net Income"}}

lines = [line1,line2,line3,line4]

# col0 = {
#         "location":{("clientid","clientid"): "1000"},
#         "typpe": "sum",
#         # "conditions":{("clientid",""): ["1000","1001","1004","1015"]},
#         "alias":{"ns_subsidiary": "test2","ns_doc_type": "test1"}}
col0 = {}
cols=[col0]
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)  #
pd.set_option('display.width', None)


def df_insert_col(df,cols):
    # 循环添加自定义列
    for col in cols:
        float_int_columns = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
        columns_to_select = [col for col in float_int_columns if col != ("groupingid", "")]

        df[('Total Amount', 'Total Amount')] = df[columns_to_select].sum(axis=1).round(2)

        df[('% of Income', '')] = df[('Total Amount', 'Total Amount')] / df.loc[
            df[("groupingid", "")] == 0, ('Total Amount', 'Total Amount')].sum()
    # print(df)
    # 根据条件添加自定义的列
    return df
def df_insert_line(df,lines):
    # 循环添加自定义行
    # df['line_gl_parent_account_code'] = df['line_gl_parent_account_code'].astype(object)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)  # 显示所有列
    df = df.reset_index(drop=True)
    for line in lines:
        # 根据条件确定添加自定义行的位置
        mask = pd.Series([True] * len(df))
        for col1, value in line['location'].items():
            mask &= (df[col1] == value)
        max_index = df[mask].index.max()

        # 获取添加自定义行的求和条件
        # columns_to_select = list(line['field_define'])
        float_int_columns = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
        columns_to_select = [col for col in float_int_columns if col != ("groupingid", "") or col != ("groupingid", "")]
        # 获取添加自定义行的求和条件
        # 对groupingid为0的所有数据进行过滤
        sum_df = df[df[("groupingid", "")] == 0].reset_index(drop=True)
        # 对condition条件进行过滤求和
        mask = pd.Series([True] * len(sum_df))
        # 遍历条件字典,根据每个条件更新mask
        for key, values in line['conditions'].items():
            mask &= sum_df[key].isin(values)
        # 获取满足条件的行的索引
        indices = sum_df[mask].index
        # 根据条件添加自定义的行
        sum_value = pd.DataFrame()
        if line["typpe"] == "sum":
            sum_value = sum_df.loc[indices, columns_to_select].sum().round(2).to_frame().T
        elif line["typpe"] == "avg":
            sum_value = df[columns_to_select].mean().to_frame().T
        elif line["typpe"] == "max":
            sum_value = df[columns_to_select].max().to_frame().T
        elif line["typpe"] == "min":
            sum_value = df[columns_to_select].min().to_frame().T
        elif line["typpe"] == "count":
            sum_value = df[columns_to_select].count().to_frame().T
        elif line["typpe"] == "std":
            sum_value = df[columns_to_select].std().to_frame().T
        elif line["typpe"] == "var":
            sum_value = df[columns_to_select].var().to_frame().T
        elif line["typpe"] == "median":
            sum_value = df[columns_to_select].median().to_frame().T
        else:
            return {"error": "Invalid type"}
        #自定义行的命名
        for col_name, col_value in line['alias'].items():

            sum_value.insert(0, col_name, col_value)

        sum_value['groupingid'] =-1
         # 第二层
        first_list = [x[0] for x in list(df.columns)]
        second_list = [x[1] for x in list(df.columns)]
        arrays = [first_list, second_list]
        # 创建多层索引对象
        multi_index = pd.MultiIndex.from_arrays(arrays, names=('', ''))
        sum_value.columns = multi_index

        sum_value = sum_value.reset_index(drop=True)
        # 将自定义行添加到原df中
        max_index = max_index + 1
        upper_df = df.iloc[:max_index]
        lower_df = df.iloc[max_index:]
        # 插入到指定位置
        df = pd.concat([upper_df, sum_value, lower_df], ignore_index=True)
    return df

def convert(df_loaded):
    df_loaded = df_loaded.drop(df_loaded.columns[[-1, -3]], axis=1)

    # 获取那些列可以进行计算
    float_int_columns = [col for col in df_loaded.columns if df_loaded[col].dtype in ['float64', 'int64']]
    columns_to_select = [col for col in float_int_columns if col != ("groupingid", "")]

    # 重新生成index
    df_loaded = df_loaded.reset_index()

    # 定义排序的顺序
    custom_order = ['Income', 'Cost of Goods Sold', 'Expense', 'Other Income', 'Tax Expense']
    df_loaded[('line_gl_account_type', '')] = pd.Categorical(df_loaded[('line_gl_account_type', '')],
                                                             categories=custom_order, ordered=True)
    df_sorted = df_loaded.sort_values(
        by=[('line_gl_account_type', ''), ('line_gl_parent_account_name', ''), ('groupingid', '')])
    # 插入行
    df = df_insert_line(df_sorted, lines)
    # 插入列
    df = df_insert_col(df, cols)
    # 创建两行空行

    empty_rows = pd.DataFrame([[None] * len(df.columns)]*2, columns=df.columns)
    empty_rows.iloc[0, df.columns.get_loc(('line_gl_account_type', ''))] = '  '
    empty_rows.iloc[1, df.columns.get_loc(('line_gl_account_type', ''))] = ' '
    empty_rows.iloc[0, df.columns.get_loc(('line_gl_parent_account_name', ''))] = ' '
    empty_rows.iloc[1, df.columns.get_loc(('line_gl_parent_account_name', ''))] = ' '
    empty_rows.iloc[0, df.columns.get_loc(('line_gl_account_name', ''))] = ' '
    empty_rows.iloc[1, df.columns.get_loc(('line_gl_account_name', ''))] = ' '
    # 将空行添加到 DataFrame 的末尾
    df = pd.concat([df, empty_rows], ignore_index=True)

    # 第二部分自定义单元格line1

    df_cell = pd.DataFrame(columns=df.columns)
    df_cell[('line_gl_account_type', '')] = 'Gross Profit Ratio'
    df_cell[('line_gl_parent_account_name', '')] = 'Gross Profit Ratio'
    df_cell[('line_gl_account_name', '')] = 'Gross Profit Ratio'
    if columns_to_select:
        for column in columns_to_select:
            per = df.loc[df[('line_gl_account_type', '')] == 'Gross Profit', column] / df.loc[
                (df[('groupingid', '')] == 0) & (df[('line_gl_account_type', '')] == 'Income'), column].sum()
            df_cell[column] = per
    df_cell[('groupingid', '')] = -2

    df_cell[('line_gl_account_type', '')] = 'Gross Profit Ratio'
    df_cell[('line_gl_parent_account_name', '')] = 'Gross Profit Ratio'
    df_cell[('line_gl_account_name', '')] = 'Gross Profit Ratio'
    df = pd.concat([df, df_cell], ignore_index=True)

    # 第二部分自定义单元格line2

    df_cell = df2 = pd.DataFrame(columns=df.columns)

    if columns_to_select:
        for column in columns_to_select:
            per = df.loc[df[('line_gl_account_type', '')] == 'Net Income', column] / df.loc[
                (df[('groupingid', '')] == 0) & (df[('line_gl_account_type', '')] == 'Income'), column].sum()
            df_cell[column] = per
    df_cell[('groupingid', '')] = -2

    df_cell[('line_gl_account_type', '')] = 'Net Profit Ratio'
    df_cell[('line_gl_parent_account_name', '')] = 'Net Profit Ratio'
    df_cell[('line_gl_account_name', '')] = 'Net Profit Ratio'
    df = pd.concat([df, df_cell], ignore_index=True)

    # 第三部分自定义单元格line3
    df_cell = df2 = pd.DataFrame(columns=df.columns)

    if columns_to_select:
        for column in columns_to_select:
            per = df.loc[df[('line_gl_account_type', '')] == 'Net Ordinary Income', column] + df.loc[
                (df[('groupingid', '')] == 0) & (df[('line_gl_account_name', '')].astype(str).str.startswith(
                    ('50070', '70010', '70015'))), column].sum()
            df_cell[column] = per
    df_cell[('groupingid', '')] = -2

    df_cell[('line_gl_account_type', '')] = 'EBITDA'
    df_cell[('line_gl_parent_account_name', '')] = 'EBITDA'
    df_cell[('line_gl_account_name', '')] = 'EBITDA'
    df = pd.concat([df, df_cell], ignore_index=True)

    # 删除数据为空的行
    df = df[df[('Total Amount', 'Total Amount')] != 0]

    # 将('line_gl_account_type', '') 为'Income', 'Other Income' 的类型为float6464 的数据乘以-1
    columns_to_modify = df.select_dtypes(include=['float64']).columns.difference([('groupingid', '')])
    df.loc[df[('line_gl_account_type', '')].isin(['Income', 'Other Income']), columns_to_modify] *= -1

    df.loc[df[('groupingid', '')] == -1, columns_to_modify] *= -1

    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Income'), (
    'line_gl_parent_account_name', '')] = 'Total Income'
    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Income'), (
    'line_gl_account_name', '')] = 'Total Income'

    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Cost of Goods Sold'), (
    'line_gl_parent_account_name', '')] = 'Total Cost of Goods Sold'
    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Cost of Goods Sold'), (
    'line_gl_account_name', '')] = 'Total Cost of Goods Sold'

    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Expense'), (
    'line_gl_parent_account_name', '')] = 'Total Expense'
    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Expense'), (
    'line_gl_account_name', '')] = 'Total Expense'

    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Other Income'), (
    'line_gl_parent_account_name', '')] = 'Total Other Income'
    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Other Income'), (
    'line_gl_account_name', '')] = 'Total Other Income'

    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Tax Expense'), (
    'line_gl_parent_account_name', '')] = 'Total Tax Expense'
    df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Tax Expense'), (
    'line_gl_account_name', '')] = 'Total Tax Expense'

    condition = df[('groupingid', '')].isin([1, 3])
    # 赋值
    df.loc[condition & df[('line_gl_account_type', '')].isna(), 'line_gl_account_type'] = 'Subtotal'
    df.loc[condition & df[('line_gl_parent_account_name', '')].isna(), 'line_gl_parent_account_name'] = 'Subtotal'
    df.loc[condition & df[('line_gl_account_name', '')].isna(), 'line_gl_account_name'] = 'Subtotal'

    df = df.set_index(['line_gl_account_type',
                       'line_gl_parent_account_name',
                       'line_gl_account_name'])
    df.index.names = [
        'Account Type',
        'Parent Account',
        'Account'
    ]
    df = df[df[('groupingid', '')] != 7]

    def highlight_row_red(row):
        # 如果 'groupingid' 列的值为 3,则整行标记为红色背景
        if row[('groupingid', '')] == 1:
            return ['background-color: #e6f7e6;font-weight: bold' for _ in row]  # 中红色
        elif row[('groupingid', '')] == 3:
            return ['background-color: #cce5ff;font-weight: bold' for _ in row]  # 中红色
        elif row[('groupingid', '')] == -1:
            return ['background-color: #66b3ff;font-weight: bold' for _ in row]  # 浅蓝色
        elif row[('groupingid', '')] == -2:
            return ['background-color: #1a75ff;font-weight: bold' for _ in row]  # 中蓝色
        else:
            return ['' for _ in row]  # 默认无色

    # 应用样式
    # df = df.round(2)
    for column in df.select_dtypes(include=['float64']).columns:
        df[column] = df[column].apply(lambda x: f"{x:,.2f}" if pd.notna(x) and x > 1 else x)

    for col in df.index[-3:]:
        df[col] = df[col].apply(lambda x: f"{x:.2%}" if pd.notna(x) else x)

        # 应用样式
    # float_columns = df.select_dtypes(include='float64').columns
    styled_df = df.style.apply(highlight_row_red, axis=1).format(na_rep='', precision=2)

    # 加边框线
    styled_df = styled_df.set_table_styles(
        [{'selector': 'th', 'props': [('border', '1px solid black')]},
         {'selector': 'td', 'props': [('border', '1px solid black')]}]
    )

    # float_cols = [('2024-02', 'line_net_amount')]
    # text_cols = [('2024-06', 'line_net_amount')]

    # text_cols = df.select_dtypes(include=['object']).columns
    # # 查找 float64 类型的列,并对这些列进行右对齐
    # float_cols = df.select_dtypes(include=['float64']).columns

    styled_df = styled_df.set_properties(**{'text-align': 'right'})



    # styled_df =  styled_df.hide_columns([('groupingid', '')])
    # 获取那些列可以进行计算

    return styled_df
# if __name__ == '__main__':
#     df_loaded = pd.read_pickle(r'E:\py\df\5555.pkl')
#     convert(df_loaded)