|
|
import pandas as pd
|
|
|
from datetime import datetime
|
|
|
import numpy as np
|
|
|
|
|
|
import re
|
|
|
import logging
|
|
|
|
|
|
# 获取日志记录器
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
def effective_cities(df):
|
|
|
try:
|
|
|
logger.info("开始判断是否81地市,先从地市判断,如果没有,则从县区判断")
|
|
|
|
|
|
# ---------------------------------test---------------------------------------
|
|
|
# df = pd.read_excel(
|
|
|
# r'E:\work_data\work\三工单日报\三工单\20250309\20250309\南方电网话务及投诉统计表(3月9日).xlsx',
|
|
|
# sheet_name='投诉明细(供电类)',
|
|
|
# skiprows=1,
|
|
|
# )
|
|
|
# # 将时间列转成字符串
|
|
|
# df['time'] = df['受理时间'].astype(str)
|
|
|
# # 通过字符串功能格式化时间
|
|
|
# df['time'] = df['time'].str.replace('-', '/')
|
|
|
# # 转成date方便比较
|
|
|
# df['datetime'] = pd.to_datetime(df['time'])
|
|
|
#
|
|
|
# # 开始时间和结束时间
|
|
|
# start_time = datetime(2025, 3, 8, 17, 0, 0)
|
|
|
# end_time = datetime(2025, 3, 9, 17, 0, 0)
|
|
|
# # 拿到供电质量在当天的数据
|
|
|
# df = df[(df['datetime'] > start_time) & (df['datetime'] <= end_time) & (df['一级分类'] == '供电质量')]
|
|
|
#
|
|
|
# # 判断数据区里面是否有深圳
|
|
|
# df['地市'] = df['地市'].astype(str)
|
|
|
#
|
|
|
# df.loc[df['地市'].str.contains('深圳'), '省'] = '深圳'
|
|
|
|
|
|
# ---------------------------------test---------------------------------------
|
|
|
# 判断是否81个地市,如果不是,则忽略
|
|
|
# 省份正则
|
|
|
|
|
|
# 81地市正则
|
|
|
city_list = [
|
|
|
"文昌",
|
|
|
"丽江",
|
|
|
"贺州",
|
|
|
"澄迈",
|
|
|
"迪庆",
|
|
|
"玉林",
|
|
|
"河池",
|
|
|
"百色",
|
|
|
"梧州",
|
|
|
"崇左",
|
|
|
"怒江",
|
|
|
"贵港",
|
|
|
"韶关",
|
|
|
"琼中",
|
|
|
"肇庆",
|
|
|
"文山",
|
|
|
"桂林",
|
|
|
"都匀",
|
|
|
"大理",
|
|
|
"瑞丽",
|
|
|
"南宁",
|
|
|
"汕尾",
|
|
|
"来宾",
|
|
|
"防城港",
|
|
|
"钦州",
|
|
|
"柳州",
|
|
|
"清远",
|
|
|
"阳江",
|
|
|
"六盘水",
|
|
|
"梅州",
|
|
|
"北海",
|
|
|
"昆明",
|
|
|
"兴义",
|
|
|
"揭阳",
|
|
|
"万宁",
|
|
|
"红河",
|
|
|
"定安",
|
|
|
"潮州",
|
|
|
"茂名",
|
|
|
"海口",
|
|
|
"云浮",
|
|
|
"德宏",
|
|
|
"汕头",
|
|
|
"惠州",
|
|
|
"湛江",
|
|
|
"毕节",
|
|
|
"铜仁",
|
|
|
"江门",
|
|
|
"凯里",
|
|
|
"三亚",
|
|
|
"楚雄",
|
|
|
"儋州",
|
|
|
"东莞",
|
|
|
"河源",
|
|
|
"中山",
|
|
|
"珠海",
|
|
|
"临高",
|
|
|
"乐东",
|
|
|
"遵义",
|
|
|
"东方",
|
|
|
"佛山",
|
|
|
"安顺",
|
|
|
"琼海",
|
|
|
"贵阳",
|
|
|
"广州",
|
|
|
"陵水",
|
|
|
"深圳",
|
|
|
"保亭",
|
|
|
"屯昌",
|
|
|
"白沙",
|
|
|
"昌江",
|
|
|
"五指山",
|
|
|
"贵安",
|
|
|
"昭通",
|
|
|
"临沧",
|
|
|
"曲靖",
|
|
|
"西双版纳",
|
|
|
"普洱",
|
|
|
"玉溪",
|
|
|
"保山",
|
|
|
"三沙",
|
|
|
]
|
|
|
|
|
|
# 深圳区正则
|
|
|
sz_district_list = [
|
|
|
"罗湖",
|
|
|
"福田",
|
|
|
"南山",
|
|
|
"宝安",
|
|
|
"龙岗",
|
|
|
"盐田",
|
|
|
"龙华",
|
|
|
"坪山",
|
|
|
"光明",
|
|
|
"大鹏",
|
|
|
]
|
|
|
|
|
|
# 地市判断正则
|
|
|
city_pattern1 = "|".join(city_list)
|
|
|
# 深圳区判断正则
|
|
|
sz_district_pattern = "|".join(sz_district_list)
|
|
|
|
|
|
# 地市里需要将深圳的区改成深圳:“罗湖|福田|南山|宝安|龙岗|盐田|龙华|坪山|光明|大鹏”
|
|
|
# df['地市'] = df['地市'].replace(sz_district_pattern, '深圳', regex=True)
|
|
|
# df["地市"] = np.where(
|
|
|
# df["地市"].fillna('').str.contains(sz_district_pattern, regex=True),
|
|
|
# "深圳", # 是 → 替换成固定值
|
|
|
# np.where(
|
|
|
# df["县区"].fillna('').str.contains(sz_district_pattern, regex=True),
|
|
|
# "深圳", # 是 → 替换成另一个固定值
|
|
|
# df["地市"] # 否 → 保持 col1(或改为 "" / np.nan)
|
|
|
# )
|
|
|
# )
|
|
|
# -------------------------------------看起来是深圳的区,不一定是深圳----------------------------------------------
|
|
|
# 条件1:b 字段匹配正则
|
|
|
mask_b = df["地市"].str.contains(sz_district_pattern, regex=True, na=False)
|
|
|
|
|
|
# 条件2:b 不匹配,但 c 字段匹配正则
|
|
|
mask_c = (~mask_b) & df["县区"].str.contains(
|
|
|
sz_district_pattern, regex=True, na=False
|
|
|
)
|
|
|
|
|
|
# 结合 a 字段的条件(例如,仅当 a 是 1, 3, 5 时才允许替换)
|
|
|
mask_a = df["省"].isin(["广东", "深圳"])
|
|
|
|
|
|
# 最终替换条件:满足 (b匹配 或 c匹配) 且 a 在允许范围内
|
|
|
final_mask = (mask_b | mask_c) & mask_a
|
|
|
|
|
|
# 执行替换
|
|
|
df["地市"] = np.where(final_mask, "深圳", df["地市"])
|
|
|
|
|
|
logger.info(f"判断县区是否有深圳的信息{df['地市']}")
|
|
|
# -------------------------------------看起来是深圳的区,不一定是深圳----------------------------------------------
|
|
|
# 不包含在地市,就在区县取值
|
|
|
df["地市"] = np.where(
|
|
|
df["地市"].fillna("").str.contains(city_pattern1, regex=True),
|
|
|
df["地市"], # 是 → 保留 col1
|
|
|
np.where(
|
|
|
df["县区"].fillna("").str.contains(city_pattern1, regex=True),
|
|
|
df["县区"], # 是 → 取 col2
|
|
|
df["地市"], # 否 → 保持 col1(或改为 "" / np.nan)
|
|
|
),
|
|
|
)
|
|
|
|
|
|
logger.info(f"81地市通过地市及县区修正后的数据{df}")
|
|
|
# df = df[df['地市'].str.contains(city_pattern1)]
|
|
|
# 如果以上述地市开头,并且中间还有信息,则直接用“地市”加“供电局”
|
|
|
# 遍历数组,然后一一替换
|
|
|
for city in city_list:
|
|
|
df["地市"] = df["地市"].apply(lambda x: city if re.search(city, x) else x)
|
|
|
|
|
|
logger.info(f"81地市修改成功")
|
|
|
return df
|
|
|
|
|
|
except Exception as e:
|
|
|
logger.info(f"81地市修改失败{e}")
|
|
|
|
|
|
|
|
|
#
|
|
|
# if __name__ == '__main__':
|
|
|
#
|
|
|
# df = effective_cities()
|
|
|
# print(df.head())
|