根據輸入的地址,利用已有的地址編碼文件構造匹配規則與處理策略,識別出該地址對應的編碼。
lib/address.json:地址編碼文件(這個文件太大,博客裡放不下,需要的話可以到 gitcode 倉庫獲取:https://gitcode.com/TomorrowAndTuture/address_code)
{"110000000000": {"province_name": "北京市","city_datas": {"110100000000": {"city_name": "市轄區","district_datas": {"110101000000": {"district_name": "東城區","town_datas": {"110101001000": {"town_name": "東華門街道","village_datas": {"110101001001": "多福巷社區居委會","110101001002": "銀閘社區居委會","110101001005": "東廠社區居委會","110101001006": "智德社區居委會",
...
main.py
根據輸入的詳細地址,返回該地址的地址編碼(會盡可能查找到更詳盡的編碼)
import json
import re
import traceback
from typing import Dict
import logging
import os
from logging import handlers


def _logging(**kwargs):
    """Return a logger that writes to a file rotated every midnight.

    Keyword Args:
        level: Level applied to both the logger and its file handler
            (default ``logging.DEBUG``).
        filename: Path of the log file; also used as the logger name
            (default ``'default.log'``).
        datefmt: ``strftime`` pattern for ``%(asctime)s``.
        format: Record format string.

    Returns:
        The configured ``logging.Logger``.

    Calling this again with the same ``filename`` reuses the handler that is
    already attached instead of stacking a second one, which would make every
    record appear twice in the file.
    """
    level = kwargs.pop('level', logging.DEBUG)
    filename = kwargs.pop('filename', 'default.log')
    datefmt = kwargs.pop('datefmt', '%Y-%m-%d %H:%M:%S')
    # Kept under the public keyword 'format', but bound to a local name that
    # does not shadow the builtin.
    fmt = kwargs.pop('format', '[%(asctime)s,%(msecs)d][%(module)s][%(levelname)s] %(lineno)d - %(message)s')

    log = logging.getLogger(filename)
    formatter = logging.Formatter(fmt, datefmt)

    # FileHandler stores the absolute path in ``baseFilename``; compare on it.
    target = os.path.abspath(filename)
    # The handler opens the file immediately, so the directory must exist.
    os.makedirs(os.path.dirname(target), exist_ok=True)

    for existing in log.handlers:
        if isinstance(existing, handlers.TimedRotatingFileHandler) and existing.baseFilename == target:
            th = existing
            break
    else:
        th = handlers.TimedRotatingFileHandler(filename=filename, when='MIDNIGHT', backupCount=30, encoding="utf-8")
        # Rotated files are named "<filename>.YYYYMMDD.log"; extMatch must
        # recognise that suffix so old backups can be pruned.
        th.suffix = "%Y%m%d.log"
        th.extMatch = re.compile(r"^\d{4}\d{2}\d{2}(\.\w+)?$", re.ASCII)
        log.addHandler(th)

    th.setFormatter(formatter)
    th.setLevel(level)
    log.setLevel(level)
    return log


root_dir = os.path.dirname(os.path.abspath(__file__))
lib_dir = os.path.join(root_dir, "lib")
logs_dir = os.path.join(root_dir, "logs")
os.makedirs(logs_dir, exist_ok=True)
os.makedirs(lib_dir, exist_ok=True)
# Anchor the log file to logs_dir (created above) rather than to "./logs",
# which is resolved against the current working directory and may not exist
# when the script is launched from somewhere else.
logger = _logging(filename=os.path.join(logs_dir, "address.log"))
address_file_path = os.path.join(lib_dir, "address.json")
# province_file_path = os.path.join(lib_dir, "province.json") # 省份
# city_file_path = os.path.join(lib_dir, "city.json") # 城市
# district_file_path = os.path.join(lib_dir, "district.json") # 區縣
# town_file_path = os.path.join(lib_dir, "town.json") # 鄉鎮
# village_file_path = os.path.join(lib_dir, "village.json")  # villages


class AddressHandler:
    """Resolve a free-text Chinese address to the most specific region code.

    The nested code table (province -> city -> district -> town -> village)
    is loaded once from ``address_file_path`` and flattened into one
    ``code -> name`` dictionary per level.
    """

    # Ethnic-group qualifiers stripped from town names before matching.
    _FOLKS = (
        "回族", "滿族", "蒙古族", "俄羅斯族", "朝鮮族", "傈僳族", "錫伯族", "達斡爾族", "柯爾克孜族", "鄂倫春族",
        "畬族", "土家族", "侗族", "瑤族", "苗族", "維吾爾族", "白族", "壯族", "仫佬族", "仡佬族", "彝族", "藏族",
        "羌族", "傣族", "納西族", "水族", "毛南族", "普米族", "哈尼族", "佤族", "拉祜族", "德昂族", "布朗族",
        "基諾族", "阿昌族", "怒族", "東鄉族", "土族", "哈薩克族", "塔吉克族",
    )

    # Ordered (old, new) replacements that reduce an official village or
    # committee name to the short form people write in addresses.
    _VILLAGE_SUFFIX_RULES = (
        ("村村民委員會", "村"),
        ("村民委員會", "村"),
        ("村委會", "村"),
        ("村村民居委會", "村"),
        ("社區居委會", "社區"),
        ("居民委員會", ""),
        ("居委會", ""),
        ("委員會", ""),
        ("委會", ""),
    )

    # Groups: 1 province, 2 city, 3 district, 4 town, 5 village.
    _ADDRESS_PATTERN = re.compile(
        r"(.*?省|.*?自治區|.*?市)?(.*?市|.*?自治州)?(.*?區|.*?縣)?(.*?鎮|.*?鄉|.*?街道|.*?街|.*?辦事處)?(.*?村|.*?社區|.*?路)?"
    )

    def __init__(self):
        self.address_datas: Dict[str, dict] = {}
        self.province_dict: Dict[str, str] = {}
        self.city_dict: Dict[str, str] = {}
        self.district_dict: Dict[str, str] = {}
        self.town_dict: Dict[str, str] = {}
        self.village_dict: Dict[str, str] = {}
        self.load_datas()

    def load_datas(self):
        """Load the code table and build the per-level code -> name maps."""
        logger.info("load address data ...")
        # Use a context manager so the file handle is closed deterministically.
        with open(address_file_path, encoding='utf-8') as address_file:
            self.address_datas = json.load(address_file)

        for province_code, province_info in self.address_datas.items():
            self.province_dict[province_code] = province_info["province_name"]
            for city_code, city_info in province_info["city_datas"].items():
                self.city_dict[city_code] = city_info["city_name"]
                for district_code, district_info in city_info["district_datas"].items():
                    self.district_dict[district_code] = district_info["district_name"]
                    for town_code, town_info in district_info["town_datas"].items():
                        self.town_dict[town_code] = town_info["town_name"]
                        for village_code, village_name in town_info["village_datas"].items():
                            self.village_dict[village_code] = village_name

    def get_province_info(self, address: str):
        """Return ``(code, name)`` of the province the address starts with.

        Administrative suffixes are stripped from the province name first so
        that short forms of the name also match. Returns ``('', '')`` when
        nothing matches.
        """
        for code, name in self.province_dict.items():
            tmp_name = str(name).replace("省", "").replace("市", "").replace("自治區", "") \
                .replace("維吾爾", "").replace("壯族", "").replace("回族", "")
            # An empty needle would make startswith() trivially true.
            if tmp_name and address.startswith(tmp_name):
                return code, name
        return '', ''

    def get_city_info(self, address: str, province_code: str = ""):
        """Return ``(code, name)`` of the first city whose short name is in the address.

        When ``province_code`` is given, only cities sharing its first two
        digits are considered. Returns ``('', '')`` when nothing matches.
        """
        # An empty prefix matches every code, which covers the "no province" case.
        code_prefix = province_code[:2]
        for code, name in self.city_dict.items():
            tmp_name = str(name).replace("市", "").replace("自治州", "")
            if tmp_name and tmp_name in address and code.startswith(code_prefix):
                return code, name
        return '', ''

    def get_district_info(self, address: str, province_code: str):
        """Return ``(code, name)`` of the first district of the province named in the address.

        Returns ``('', '')`` when nothing matches.
        """
        code_prefix = province_code[:2]
        # Scan the district table: the caller derives the city code from the
        # first four digits of the returned code, which only works for a
        # district-level code.
        for code, name in self.district_dict.items():
            if name and name in address and code.startswith(code_prefix):
                return code, name
        return '', ''

    @staticmethod
    def replace_folk(town: str):
        """Remove ethnic-group qualifiers from a town name."""
        for folk in AddressHandler._FOLKS:
            town = town.replace(folk, "")
        return town

    @staticmethod
    def _simplify_village_name(name: str):
        """Shorten an official village/committee name; keep it if the result is too short."""
        simplified = name
        for old, new in AddressHandler._VILLAGE_SUFFIX_RULES:
            simplified = simplified.replace(old, new)
        # A one-character (or empty) needle would match almost anything.
        return name if len(simplified) <= 1 else simplified

    def get_district_town_village_info(self, district_datas, tmp_split_after_city_address, city_code):
        """Progressively match district, then town, then village.

        Each level is searched only in the part of the address that follows
        the previous level's match.

        Args:
            district_datas: The ``district_datas`` mapping of the resolved city.
            tmp_split_after_city_address: Address text after the city name.
            city_code: Code of the resolved city.

        Returns:
            ``(district_name, district_code, town_name, town_code,
            village_name, village_code)``; unmatched levels are ``""``.
            The search stops at the first village match; otherwise the last
            district/town that matched during the scan is reported.
        """
        district_code = ""
        district_name = ""
        town_code = ""
        town_name = ""
        village_code = ""
        village_name = ""
        logger.info(f"遞進查詢區縣、鄉鎮和村鎮信息")
        for k1, v1 in district_datas.items():
            base_district_name = v1["district_name"]
            if not base_district_name:
                continue
            if base_district_name not in tmp_split_after_city_address or not k1.startswith(city_code[:4]):
                continue
            district_code = k1
            district_name = base_district_name
            # Text following the district name.
            tmp_split_after_district_address = tmp_split_after_city_address.split(base_district_name, 1)[-1]
            for k2, v2 in v1.get("town_datas", {}).items():
                base_town_name = v2["town_name"]
                tmp_base_town_name = self.replace_folk(base_town_name)
                if len(tmp_base_town_name) <= 1:
                    tmp_base_town_name = base_town_name
                if tmp_base_town_name not in tmp_split_after_district_address \
                        or not k2.startswith(district_code[:6]):
                    continue
                town_code = k2
                town_name = base_town_name
                # Text following the town name.
                tmp_split_after_town_address = tmp_split_after_district_address.split(tmp_base_town_name, 1)[-1]
                for k3, v3 in v2.get("village_datas", {}).items():
                    # Use the village name itself (v3), not a constant.
                    base_village_name = str(v3)
                    tmp_base_village_name = self._simplify_village_name(base_village_name)
                    if tmp_base_village_name in tmp_split_after_town_address and k3.startswith(town_code[:9]):
                        village_code = k3
                        village_name = base_village_name
                        return district_name, district_code, town_name, town_code, village_name, village_code
        return district_name, district_code, town_name, town_code, village_name, village_code

    def _match_town_code(self, match_town_name, district_code, city_code):
        """Find a town in the code table that contains, or is contained in, the regex match."""
        prefix = district_code[0:6] if district_code else city_code[0:4]
        for code, name in self.town_dict.items():
            if not name or code[0:len(prefix)] != prefix:
                continue
            if name in match_town_name or match_town_name in name:
                return code, name
        return "", ""

    def _match_village_code(self, match_village_name, town_code, district_code, city_code):
        """Find a village in the code table matching the regex match, scoped as tightly as possible."""
        if town_code:
            prefix = town_code[0:9]
        elif district_code:
            prefix = district_code[0:6]
        else:
            prefix = city_code[0:4]
        for code, name in self.village_dict.items():
            if not name or code[0:len(prefix)] != prefix:
                continue
            tmp_name = self._simplify_village_name(name)
            if tmp_name in match_village_name or match_village_name in tmp_name:
                return code, name
        return "", ""

    def handle_address(self, address: str):
        """Resolve ``address`` to the most specific code that can be found.

        Returns:
            ``(status_code, info, address_code)``. ``status_code`` is always
            ``1``; ``info`` is ``"success"`` or a message saying which levels
            could not be found; ``address_code`` is ``"000000000000"`` when
            not even a province or city could be identified.
        """
        status_code = 1
        info = "success"
        address_code = "000000000000"
        logger.info(f"地址:{address}")

        province_code, province_name = self.get_province_info(address)
        if not province_code:
            logger.info("未查詢到省份信息,先略過省份查詢,優先查詢城市信息")
            # Search the address itself (not the placeholder code) for a city.
            city_code, city_name = self.get_city_info(address)
            if city_code:
                province_code = city_code[:2] + '0' * 10
                province_name = self.province_dict.get(province_code)
                logger.info(f"優先查詢到城市:{city_name}")
            else:
                info = "省份和城市信息均未查到"
                logger.info(info)
                return status_code, info, address_code
        logger.info(f"解析到省份:{province_name}")
        address_code = province_code
        province_datas = self.address_datas.get(province_code, {})
        city_datas = province_datas.get("city_datas", {})

        city_code, city_name = self.get_city_info(address, province_code)
        if not city_code:
            logger.info(f"未查詢到城市信息,先略過城市查詢,優先查詢區縣信息")
            district_code, district_name = self.get_district_info(address, province_code)
            if district_code:
                city_code = district_code[:4] + '0' * 8
                city_name = self.city_dict.get(city_code)
                logger.info(f"優先查詢到區縣:{district_name}")
            else:
                info = "城市和區縣均未查到"
                logger.info(info)
                return status_code, info, address_code
        logger.info(f"解析到城市:{city_name}")
        # city_name can be None when it was derived from a district code.
        tmp_city_name = (city_name or "").replace("市", "").replace("自治州", "").replace("地區", "")
        address_code = city_code
        # str.split rejects an empty separator, so fall back to the full text.
        if tmp_city_name:
            tmp_split_after_city_address = address.split(tmp_city_name, 1)[-1]
        else:
            tmp_split_after_city_address = address

        district_datas = city_datas.get(city_code, {}).get("district_datas", {})
        district_name, district_code, town_name, town_code, village_name, village_code = \
            self.get_district_town_village_info(district_datas, tmp_split_after_city_address, city_code)
        logger.info(f"區縣:{district_name or None};鄉鎮:{town_name or None};村鎮:{village_name or None}")

        # Progressive matching found no town: fall back to a regex split of
        # the address and look the pieces up in the flat tables.
        if town_name == "":
            logger.info(f"鄉鎮信息未查到,繼續使用正則表達式再次匹配查找")
            search_obj = self._ADDRESS_PATTERN.search(address)
            if search_obj:
                match_town_name = search_obj.group(4)
                match_village_name = search_obj.group(5)
                if match_town_name and town_name == "":
                    town_name = match_town_name
                if match_village_name and village_name == "":
                    village_name = match_village_name
                logger.info(f"正則匹配到鄉鎮和村鎮信息:{match_town_name};{match_village_name}")
                if match_town_name:
                    found_code, found_name = self._match_town_code(match_town_name, district_code, city_code)
                    if found_code:
                        logger.info(f"正則查找到鄉鎮信息:{found_name}")
                        town_code, town_name = found_code, found_name
                if match_village_name:
                    found_code, found_name = self._match_village_code(
                        match_village_name, town_code, district_code, city_code)
                    if found_code:
                        logger.info(f"正則查找到村鎮信息:{found_name}")
                        village_code, village_name = found_code, found_name

        # Back-fill parent levels from the code prefix of a more specific match.
        if not town_code and village_code:
            town_code = village_code[0:9] + "0" * 3
            town_name = self.town_dict.get(town_code)
            logger.info(f"反向查找到鄉鎮:{town_name}")
        if not district_code and town_code:
            district_code = town_code[0:6] + "0" * 6
            district_name = self.district_dict.get(district_code)
            logger.info(f"反向查找到區縣:{district_name}")

        address_code = village_code or town_code or district_code or city_code or address_code
        logger.info(f"{province_name}: {province_code}, {city_name}: {city_code}, {district_name}: {district_code}, {town_name}: {town_code}, {village_name}: {village_code}")
        return status_code, info, address_code


address_handle = AddressHandler()


def process(address):
    """Resolve ``address`` and return a result dict; never raises ``Exception``.

    Returns:
        ``{"code": int, "info": str, "address_code": str}``. On an unexpected
        error ``code`` is ``-1``, ``info`` starts with ``"fail"`` and the
        traceback is written to the log.
    """
    logger.info("========== start process ==========")
    output_data = {
        "code": -1,
        "info": "fail",
        "address_code": "",
    }
    try:
        status_code, info, address_code = address_handle.handle_address(address)
        output_data["code"] = status_code
        output_data["info"] = info
        output_data["address_code"] = address_code
    except Exception as e:
        logger.error(traceback.format_exc())
        output_data["code"] = -1
        output_data["info"] = f"fail: {e}"
    finally:
        logger.info(f"output_data: {output_data}")
        logger.info("========== end process ==========")
    return output_data


if __name__ == '__main__':
    address_list = ["四川省遂寧高新區寶升鎮插板堰村", "山東省淄博市臨淄區齊都鎮安合村委會", ""]
    for address in address_list:
        output = process(address)
        print(output)
    # Output recorded by the original author, before village names were
    # matched correctly in the progressive search:
    # {'code': 1, 'info': 'success', 'address_code': '510900000000'}
    # {'code': 1, 'info': 'success', 'address_code': '370305100000'}
    # {'code': 1, 'info': '省份和城市信息均未查到', 'address_code': '000000000000'}
    # NOTE(review): the second address may now resolve to a village-level
    # code instead of the town-level one above; confirm against address.json.