You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
report_app/app/tools/effective_cities.py

209 lines
6.6 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pandas as pd
from datetime import datetime
import numpy as np
import re
import logging
# Module-level logger.
logger = logging.getLogger(__name__)

# Known city names (81 entries). The order is significant: when a value
# contains more than one of these names, the first one in this tuple wins.
_CITY_NAMES = (
    "文昌",
    "丽江",
    "贺州",
    "澄迈",
    "迪庆",
    "玉林",
    "河池",
    "百色",
    "梧州",
    "崇左",
    "怒江",
    "贵港",
    "韶关",
    "琼中",
    "肇庆",
    "文山",
    "桂林",
    "都匀",
    "大理",
    "瑞丽",
    "南宁",
    "汕尾",
    "来宾",
    "防城港",
    "钦州",
    "柳州",
    "清远",
    "阳江",
    "六盘水",
    "梅州",
    "北海",
    "昆明",
    "兴义",
    "揭阳",
    "万宁",
    "红河",
    "定安",
    "潮州",
    "茂名",
    "海口",
    "云浮",
    "德宏",
    "汕头",
    "惠州",
    "湛江",
    "毕节",
    "铜仁",
    "江门",
    "凯里",
    "三亚",
    "楚雄",
    "儋州",
    "东莞",
    "河源",
    "中山",
    "珠海",
    "临高",
    "乐东",
    "遵义",
    "东方",
    "佛山",
    "安顺",
    "琼海",
    "贵阳",
    "广州",
    "陵水",
    "深圳",
    "保亭",
    "屯昌",
    "白沙",
    "昌江",
    "五指山",
    "贵安",
    "昭通",
    "临沧",
    "曲靖",
    "西双版纳",
    "普洱",
    "玉溪",
    "保山",
    "三沙",
)

# Shenzhen district names; a match is rewritten to "深圳" only when the
# province column confirms it (the same names can occur elsewhere).
_SZ_DISTRICT_NAMES = (
    "罗湖",
    "福田",
    "南山",
    "宝安",
    "龙岗",
    "盐田",
    "龙华",
    "坪山",
    "光明",
    "大鹏",
)

# Alternation patterns used with ``Series.str.contains``.
_CITY_PATTERN = "|".join(_CITY_NAMES)
_SZ_DISTRICT_PATTERN = "|".join(_SZ_DISTRICT_NAMES)

# Province values for which a Shenzhen-district match is trusted.
_SZ_PROVINCES = ["广东", "深圳"]


def _as_text(series):
    """Return *series* as strings, with missing values replaced by ''."""
    return series.where(series.notna(), "").astype(str)


def _first_known_city(value):
    """Return the first name in ``_CITY_NAMES`` contained in *value*.

    Non-string values (e.g. NaN/None) and strings that contain no known
    city name are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    for city in _CITY_NAMES:
        if city in value:
            return city
    return value


def effective_cities(df):
    """Normalise the "地市" column of *df* to the known city names.

    Steps, in order:

    1. Rows whose "地市" or "县区" contains a Shenzhen district name and
       whose province column is "广东" or "深圳" get "地市" set to "深圳".
    2. Rows whose "地市" contains no known city name but whose "县区" does
       take the "县区" value as "地市".
    3. Every "地市" string that contains a known city name is reduced to
       that bare name; other values (including missing ones) are kept.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with "地市" and "县区" columns, plus a province column
            (see the review note in the body).

    Returns:
        The same DataFrame object on success, or ``None`` if any step
        raised (the error is logged with its traceback).
    """
    try:
        logger.info("开始判断是否81地市先从地市判断如果没有则从县区判断")

        # --- Step 1: Shenzhen districts -> "深圳", guarded by province ---
        city_is_sz = _as_text(df["地市"]).str.contains(
            _SZ_DISTRICT_PATTERN, regex=True
        )
        county_is_sz = _as_text(df["县区"]).str.contains(
            _SZ_DISTRICT_PATTERN, regex=True
        )

        # NOTE(review): the original code indexed df[""] here, which raises
        # KeyError unless a column literally named "" exists. The province
        # column is presumably "省" - confirm against the caller. A column
        # named "" is still honoured first for backward compatibility.
        province_col = next(
            (name for name in ("", "省") if name in df.columns), None
        )
        if province_col is None:
            # Without a province we cannot tell a Shenzhen district from a
            # same-named place elsewhere, so leave the values untouched.
            logger.warning("effective_cities: province column not found; "
                           "skipping Shenzhen district rewrite")
        else:
            province_ok = df[province_col].isin(_SZ_PROVINCES)
            rewrite_to_sz = (city_is_sz | county_is_sz) & province_ok
            df["地市"] = df["地市"].mask(rewrite_to_sz, "深圳")
        # Lazy %-args: the Series is only rendered if INFO is enabled.
        logger.info("判断县区是否有深圳的信息%s", df["地市"])

        # --- Step 2: fall back to the county when the city is unknown ---
        city_known = _as_text(df["地市"]).str.contains(
            _CITY_PATTERN, regex=True
        )
        county_known = _as_text(df["县区"]).str.contains(
            _CITY_PATTERN, regex=True
        )
        # Keep "地市" unless it is unknown AND "县区" names a known city.
        df["地市"] = df["地市"].where(city_known | ~county_known, df["县区"])
        logger.info("81地市通过地市及县区修正后的数据%s", df)

        # --- Step 3: collapse e.g. "广州供电局" to "广州" in one pass ---
        # Missing values pass through instead of raising TypeError.
        df["地市"] = df["地市"].map(_first_known_city)
        logger.info("81地市修改成功")
        return df
    except Exception as e:
        # Best-effort contract kept: log (now with traceback) and return
        # None rather than propagating to the caller.
        logger.exception("81地市修改失败%s", e)
        return None
#
# if __name__ == '__main__':
#
# df = effective_cities()
# print(df.head())