import pandas as pd from datetime import datetime import numpy as np import re import logging # 获取日志记录器 logger = logging.getLogger(__name__) def effective_cities(df): try: logger.info("开始判断是否81地市,先从地市判断,如果没有,则从县区判断") # ---------------------------------test--------------------------------------- # df = pd.read_excel( # r'E:\work_data\work\三工单日报\三工单\20250309\20250309\南方电网话务及投诉统计表(3月9日).xlsx', # sheet_name='投诉明细(供电类)', # skiprows=1, # ) # # 将时间列转成字符串 # df['time'] = df['受理时间'].astype(str) # # 通过字符串功能格式化时间 # df['time'] = df['time'].str.replace('-', '/') # # 转成date方便比较 # df['datetime'] = pd.to_datetime(df['time']) # # # 开始时间和结束时间 # start_time = datetime(2025, 3, 8, 17, 0, 0) # end_time = datetime(2025, 3, 9, 17, 0, 0) # # 拿到供电质量在当天的数据 # df = df[(df['datetime'] > start_time) & (df['datetime'] <= end_time) & (df['一级分类'] == '供电质量')] # # # 判断数据区里面是否有深圳 # df['地市'] = df['地市'].astype(str) # # df.loc[df['地市'].str.contains('深圳'), '省'] = '深圳' # ---------------------------------test--------------------------------------- # 判断是否81个地市,如果不是,则忽略 # 省份正则 # 81地市正则 city_list = [ "文昌", "丽江", "贺州", "澄迈", "迪庆", "玉林", "河池", "百色", "梧州", "崇左", "怒江", "贵港", "韶关", "琼中", "肇庆", "文山", "桂林", "都匀", "大理", "瑞丽", "南宁", "汕尾", "来宾", "防城港", "钦州", "柳州", "清远", "阳江", "六盘水", "梅州", "北海", "昆明", "兴义", "揭阳", "万宁", "红河", "定安", "潮州", "茂名", "海口", "云浮", "德宏", "汕头", "惠州", "湛江", "毕节", "铜仁", "江门", "凯里", "三亚", "楚雄", "儋州", "东莞", "河源", "中山", "珠海", "临高", "乐东", "遵义", "东方", "佛山", "安顺", "琼海", "贵阳", "广州", "陵水", "深圳", "保亭", "屯昌", "白沙", "昌江", "五指山", "贵安", "昭通", "临沧", "曲靖", "西双版纳", "普洱", "玉溪", "保山", "三沙", ] # 深圳区正则 sz_district_list = [ "罗湖", "福田", "南山", "宝安", "龙岗", "盐田", "龙华", "坪山", "光明", "大鹏", ] # 地市判断正则 city_pattern1 = "|".join(city_list) # 深圳区判断正则 sz_district_pattern = "|".join(sz_district_list) # 地市里需要将深圳的区改成深圳:“罗湖|福田|南山|宝安|龙岗|盐田|龙华|坪山|光明|大鹏” # df['地市'] = df['地市'].replace(sz_district_pattern, '深圳', regex=True) # df["地市"] = np.where( # df["地市"].fillna('').str.contains(sz_district_pattern, regex=True), # "深圳", # 是 → 替换成固定值 # np.where( # df["县区"].fillna('').str.contains(sz_district_pattern, regex=True), # "深圳", # 是 → 替换成另一个固定值 # df["地市"] # 否 → 保持 col1(或改为 "" / np.nan) # ) # ) # -------------------------------------看起来是深圳的区,不一定是深圳---------------------------------------------- # 条件1:b 字段匹配正则 mask_b = df["地市"].str.contains(sz_district_pattern, regex=True, na=False) # 条件2:b 不匹配,但 c 字段匹配正则 mask_c = (~mask_b) & df["县区"].str.contains( sz_district_pattern, regex=True, na=False ) # 结合 a 字段的条件(例如,仅当 a 是 1, 3, 5 时才允许替换) mask_a = df["省"].isin(["广东", "深圳"]) # 最终替换条件:满足 (b匹配 或 c匹配) 且 a 在允许范围内 final_mask = (mask_b | mask_c) & mask_a # 执行替换 df["地市"] = np.where(final_mask, "深圳", df["地市"]) logger.info(f"判断县区是否有深圳的信息{df['地市']}") # -------------------------------------看起来是深圳的区,不一定是深圳---------------------------------------------- # 不包含在地市,就在区县取值 df["地市"] = np.where( df["地市"].fillna("").str.contains(city_pattern1, regex=True), df["地市"], # 是 → 保留 col1 np.where( df["县区"].fillna("").str.contains(city_pattern1, regex=True), df["县区"], # 是 → 取 col2 df["地市"], # 否 → 保持 col1(或改为 "" / np.nan) ), ) logger.info(f"81地市通过地市及县区修正后的数据{df}") # df = df[df['地市'].str.contains(city_pattern1)] # 如果以上述地市开头,并且中间还有信息,则直接用“地市”加“供电局” # 遍历数组,然后一一替换 for city in city_list: df["地市"] = df["地市"].apply(lambda x: city if re.search(city, x) else x) logger.info(f"81地市修改成功") return df except Exception as e: logger.info(f"81地市修改失败{e}") # # if __name__ == '__main__': # # df = effective_cities() # print(df.head())