Python 之地址編碼識別

根據輸入地址,利用已有的地址編碼文件,構造處理規則策略識別地址的編碼。

lib/address.json 地址編碼文件(這個文件太大,博客里放不下,需要的話可以到 gitcode 倉庫獲取:https://gitcode.com/TomorrowAndTuture/address_code)

{"110000000000": {"province_name": "北京市","city_datas": {"110100000000": {"city_name": "市轄區","district_datas": {"110101000000": {"district_name": "東城區","town_datas": {"110101001000": {"town_name": "東華門街道","village_datas": {"110101001001": "多福巷社區居委會","110101001002": "銀閘社區居委會","110101001005": "東廠社區居委會","110101001006": "智德社區居委會",
...

main.py

根據輸入的詳細地址,返回該地址的地址編碼(會盡可能查找到更詳盡的編碼)

import json
import re
import traceback
from typing import Dict
import logging
import os
from logging import handlers


def _logging(**kwargs):
    """Return a logger that writes to a file rotated every midnight.

    Keyword Args:
        level: Threshold applied to both the logger and its file handler.
            Defaults to ``logging.DEBUG``.
        filename: Path of the log file; also used as the logger name.
            Defaults to ``'default.log'``.
        datefmt: ``strftime`` pattern for ``%(asctime)s``.
        format: Record layout passed to ``logging.Formatter``.

    Returns:
        The configured ``logging.Logger``. Calling this again with the same
        ``filename`` reuses the handler that is already attached instead of
        adding a second one (which would duplicate every log line).
    """
    level = kwargs.pop('level', logging.DEBUG)
    filename = kwargs.pop('filename', 'default.log')
    datefmt = kwargs.pop('datefmt', '%Y-%m-%d %H:%M:%S')
    # Local is named ``fmt`` so the ``format`` builtin is not shadowed; the
    # keyword callers pass is still ``format``.
    fmt = kwargs.pop('format', '[%(asctime)s,%(msecs)d][%(module)s][%(levelname)s] %(lineno)d - %(message)s')

    log = logging.getLogger(filename)
    log.setLevel(level)

    # FileHandler stores its target as an absolute path in ``baseFilename``.
    target = os.path.abspath(filename)
    for existing in log.handlers:
        if getattr(existing, "baseFilename", None) == target:
            return log

    th = handlers.TimedRotatingFileHandler(filename=filename, when='MIDNIGHT', backupCount=30, encoding="utf-8")
    # Rotated files are named "<filename>.<YYYYMMDD>.log"; extMatch must agree
    # with the suffix so that old backups are recognised and pruned.
    th.suffix = "%Y%m%d.log"
    th.extMatch = re.compile(r"^\d{4}\d{2}\d{2}(\.\w+)?$", re.ASCII)
    th.setFormatter(logging.Formatter(fmt, datefmt))
    th.setLevel(level)
    log.addHandler(th)
    return log


root_dir = os.path.dirname(os.path.abspath(__file__))
lib_dir = os.path.join(root_dir, "lib")
logs_dir = os.path.join(root_dir, "logs")
os.makedirs(logs_dir, exist_ok=True)
os.makedirs(lib_dir, exist_ok=True)
# Anchor the log file to logs_dir (created just above) rather than to
# "./logs", which is resolved against the current working directory and is
# missing whenever the script is launched from somewhere else.
logger = _logging(filename=os.path.join(logs_dir, "address.log"))
address_file_path = os.path.join(lib_dir, "address.json")
# province_file_path = os.path.join(lib_dir, "province.json")  # 省份
# city_file_path = os.path.join(lib_dir, "city.json")  # 城市
# district_file_path = os.path.join(lib_dir, "district.json")  # 區縣
# town_file_path = os.path.join(lib_dir, "town.json")  # 鄉鎮
# village_file_path = os.path.join(lib_dir, "village.json")  # villages


class AddressHandler:
    """Resolve a free-text address to the most specific 12-digit code found.

    The nested code table at ``address_file_path`` is loaded once and also
    flattened into one ``{code: name}`` mapping per administrative level
    (province, city, district, town, village).
    """

    # Ethnic-group qualifiers removed from town names before matching.
    _FOLKS = (
        "回族", "滿族", "蒙古族", "俄羅斯族", "朝鮮族", "傈僳族", "錫伯族", "達斡爾族", "柯爾克孜族", "鄂倫春族",
        "畬族", "土家族", "侗族", "瑤族", "苗族", "維吾爾族", "白族", "壯族", "仫佬族", "仡佬族", "彝族", "藏族",
        "羌族", "傣族", "納西族", "白族", "水族", "毛南族", "普米族", "哈尼族", "佤族", "拉祜族", "德昂族", "布朗族",
        "基諾族", "阿昌族", "怒族", "東鄉族", "土族", "哈薩克族", "塔吉克族",
    )

    # Ordered (old, new) rewrites that reduce an official committee name to
    # the short form people write in addresses. Order matters: longer
    # suffixes must be handled before the shorter ones they contain.
    _VILLAGE_SUFFIX_RULES = (
        ("村村民委員會", "村"),
        ("村民委員會", "村"),
        ("村委會", "村"),
        ("村村民居委會", "村"),
        ("社區居委會", "社區"),
        ("居民委員會", ""),
        ("居委會", ""),
        ("委員會", ""),
        ("委會", ""),
    )

    # Groups: 1 province, 2 city, 3 district, 4 town/street, 5 village/road.
    _ADDRESS_PATTERN = re.compile(
        r"(.*?省|.*?自治區|.*?市)?(.*?市|.*?自治州)?(.*?區|.*?縣)?(.*?鎮|.*?鄉|.*?街道|.*?街|.*?辦事處)?(.*?村|.*?社區|.*?路)?"
    )

    def __init__(self):
        """Create the empty lookup tables and fill them from the data file."""
        self.address_datas: Dict[str, dict] = {}
        self.province_dict: Dict[str, str] = {}
        self.city_dict: Dict[str, str] = {}
        self.district_dict: Dict[str, str] = {}
        self.town_dict: Dict[str, str] = {}
        self.village_dict: Dict[str, str] = {}
        self.load_datas()

    def load_datas(self):
        """Read the nested code table and build the per-level code->name maps."""
        logger.info("load address data ...")
        # Context manager so the file handle is closed deterministically.
        with open(address_file_path, encoding='utf-8') as data_file:
            self.address_datas = json.load(data_file)

        for province_code, province_info in self.address_datas.items():
            self.province_dict[province_code] = province_info["province_name"]
            for city_code, city_info in province_info["city_datas"].items():
                self.city_dict[city_code] = city_info["city_name"]
                for district_code, district_info in city_info["district_datas"].items():
                    self.district_dict[district_code] = district_info["district_name"]
                    for town_code, town_info in district_info["town_datas"].items():
                        self.town_dict[town_code] = town_info["town_name"]
                        # At the village level the value is the name itself.
                        for village_code, village_name in town_info["village_datas"].items():
                            self.village_dict[village_code] = village_name

    def get_province_info(self, address: str):
        """Return ``(code, name)`` of the province the address starts with.

        Province names are compared without their administrative suffix, so
        an address may omit it. Returns ``('', '')`` when nothing matches.
        """
        for code, name in self.province_dict.items():
            tmp_name = str(name)
            for token in ("省", "市", "自治區", "維吾爾", "壯族", "回族"):
                tmp_name = tmp_name.replace(token, "")
            # An empty short name would be a prefix of every address.
            if tmp_name and address.startswith(tmp_name):
                return code, name
        return '', ''

    def get_city_info(self, address: str, province_code: str = ""):
        """Return ``(code, name)`` of the first city whose short name occurs in ``address``.

        When ``province_code`` is given, only cities sharing its first two
        digits are considered. Returns ``('', '')`` when nothing matches.
        """
        # An empty prefix matches every code, which covers the no-province case.
        code_prefix = province_code[:2]
        for code, name in self.city_dict.items():
            tmp_name = str(name).replace("市", "").replace("自治州", "")
            if tmp_name and tmp_name in address and code.startswith(code_prefix):
                return code, name
        return '', ''

    def get_district_info(self, address: str, province_code: str):
        """Return ``(code, name)`` of the first district of the province named in ``address``.

        Returns ``('', '')`` when nothing matches.
        """
        code_prefix = province_code[:2]
        # Fixed: this used to scan province_dict, so it could only ever
        # return a province code mislabelled as a district.
        for code, name in self.district_dict.items():
            if name and name in address and code.startswith(code_prefix):
                return code, name
        return '', ''

    @staticmethod
    def replace_folk(town: str):
        """Return ``town`` with every ethnic-group qualifier removed."""
        for folk in AddressHandler._FOLKS:
            town = town.replace(folk, "")
        return town

    @staticmethod
    def _short_village_name(name: str):
        """Return the committee name reduced to its short form.

        Falls back to the unmodified name when the reduction leaves at most
        one character, which would match far too many addresses.
        """
        short_name = name
        for old, new in AddressHandler._VILLAGE_SUFFIX_RULES:
            short_name = short_name.replace(old, new)
        return name if len(short_name) <= 1 else short_name

    def get_district_town_village_info(self, district_datas, tmp_split_after_city_address, city_code):
        """Walk district -> town -> village, matching each against the remaining address.

        Args:
            district_datas: The ``district_datas`` mapping of the resolved city.
            tmp_split_after_city_address: Address text following the city name.
            city_code: Code of the resolved city; districts must share its
                first four digits.

        Returns:
            ``(district_name, district_code, town_name, town_code,
            village_name, village_code)``; levels that were not matched are
            empty strings. Returns as soon as a village is matched.
        """
        district_code = ""
        district_name = ""
        town_code = ""
        town_name = ""
        village_code = ""
        village_name = ""
        logger.info("遞進查詢區縣、鄉鎮和村鎮信息")
        for k1, v1 in district_datas.items():
            base_district_name = v1["district_name"]
            if not base_district_name:
                continue
            if base_district_name not in tmp_split_after_city_address or not k1.startswith(city_code[:4]):
                continue
            district_code = k1
            district_name = base_district_name
            # Keep only what follows the district name.
            tmp_split_after_district_address = tmp_split_after_city_address.split(base_district_name, 1)[-1]
            for k2, v2 in v1.get("town_datas", {}).items():
                base_town_name = v2["town_name"]
                tmp_base_town_name = self.replace_folk(base_town_name)
                if len(tmp_base_town_name) <= 1:
                    tmp_base_town_name = base_town_name
                if tmp_base_town_name not in tmp_split_after_district_address or not k2.startswith(district_code[:6]):
                    continue
                town_code = k2
                town_name = base_town_name
                # Keep only what follows the town name.
                tmp_split_after_town_address = tmp_split_after_district_address.split(tmp_base_town_name, 1)[-1]
                for k3, v3 in v2.get("village_datas", {}).items():
                    # Fixed: was str(3), which made every village name "3".
                    base_village_name = str(v3)
                    tmp_base_village_name = self._short_village_name(base_village_name)
                    if tmp_base_village_name in tmp_split_after_town_address and k3.startswith(town_code[:9]):
                        village_code = k3
                        village_name = base_village_name
                        return district_name, district_code, town_name, town_code, village_name, village_code
        return district_name, district_code, town_name, town_code, village_name, village_code

    def handle_address(self, address: str):
        """Resolve ``address`` to the most specific administrative code available.

        Returns:
            ``(status_code, info, address_code)``. ``status_code`` is always
            ``1``; ``info`` is ``"success"`` or the reason resolution stopped
            early; ``address_code`` is the deepest code found, or
            ``"000000000000"`` when not even a province or city was found.
        """
        status_code = 1
        info = "success"
        address_code = "000000000000"
        logger.info(f"地址:{address}")

        province_code, province_name = self.get_province_info(address)
        if not province_code:
            logger.info("未查詢到省份信息,先略過省份查詢,優先查詢城市信息")
            # Fixed: search the address itself, not the placeholder code.
            city_code, city_name = self.get_city_info(address)
            if city_code:
                province_code = city_code[:2] + '0' * 10
                province_name = self.province_dict.get(province_code)
                logger.info(f"優先查詢到城市:{city_name}")
            else:
                info = "省份和城市信息均未查到"
                logger.info(info)
                return status_code, info, address_code
        logger.info(f"解析到省份:{province_name}")
        address_code = province_code

        province_datas = self.address_datas.get(province_code, {})
        city_datas = province_datas.get("city_datas", {})
        city_code, city_name = self.get_city_info(address, province_code)
        if not city_code:
            logger.info("未查詢到城市信息,先略過城市查詢,優先查詢區縣信息")
            district_code, district_name = self.get_district_info(address, province_code)
            if district_code:
                city_code = district_code[:4] + '0' * 8
                city_name = self.city_dict.get(city_code)
                # Fixed log typo: "曲線" (curve) -> "區縣" (district).
                logger.info(f"優先查詢到區縣:{district_name}")
            else:
                info = "城市和區縣均未查到"
                logger.info(info)
                return status_code, info, address_code
        logger.info(f"解析到城市:{city_name}")
        address_code = city_code

        # city_name is None when the derived city code is absent from the
        # table; str.split also rejects an empty separator, so fall back to
        # the whole address in both cases.
        tmp_city_name = (city_name or "").replace("市", "").replace("自治州", "").replace("地區", "")
        if tmp_city_name:
            tmp_split_after_city_address = address.split(tmp_city_name, 1)[-1]
        else:
            tmp_split_after_city_address = address

        district_datas = city_datas.get(city_code, {}).get("district_datas", {})
        district_name, district_code, town_name, town_code, village_name, village_code = \
            self.get_district_town_village_info(district_datas, tmp_split_after_city_address, city_code)
        logger.info(f"區縣:{district_name or None};鄉鎮:{town_name or None};村鎮:{village_name or None}")

        # Fall back to a regular expression when the progressive walk found no town.
        if town_name == "":
            logger.info("鄉鎮信息未查到,繼續使用正則表達式再次匹配查找")
            search_obj = self._ADDRESS_PATTERN.search(address)
            if search_obj:
                match_town_name = search_obj.group(4)
                match_village_name = search_obj.group(5)
                if match_town_name:
                    town_name = match_town_name
                if match_village_name and village_name == "":
                    village_name = match_village_name
                logger.info(f"正則匹配到鄉鎮和村鎮信息:{match_town_name};{match_village_name}")

                # Compare the regex captures against the code table, limited
                # to the deepest level that has already been resolved.
                if match_town_name:
                    scope = district_code[:6] if district_code else city_code[:4]
                    for code, name in self.town_dict.items():
                        if not name or not code.startswith(scope):
                            continue
                        if name in match_town_name or match_town_name in name:
                            logger.info(f"正則查找到鄉鎮信息:{name}")
                            town_code = code
                            town_name = name
                            break
                if match_village_name:
                    if town_code:
                        scope = town_code[:9]
                    elif district_code:
                        scope = district_code[:6]
                    else:
                        scope = city_code[:4]
                    for code, name in self.village_dict.items():
                        if not name or not code.startswith(scope):
                            continue
                        tmp_name = self._short_village_name(name)
                        if tmp_name in match_village_name or match_village_name in tmp_name:
                            logger.info(f"正則查找到村鎮信息:{name}")
                            village_code = code
                            village_name = name
                            break

        # Back-fill parent levels from a deeper code that was found directly.
        if not town_code and village_code:
            town_code = village_code[0:9] + "0" * 3
            town_name = self.town_dict.get(town_code)
            logger.info(f"反向查找到鄉鎮:{town_name}")
        if not district_code and town_code:
            district_code = town_code[0:6] + "0" * 6
            district_name = self.district_dict.get(district_code)
            logger.info(f"反向查找到區縣:{district_name}")

        address_code = village_code or town_code or district_code or city_code or address_code
        logger.info(f"{province_name}: {province_code}, {city_name}: {city_code}, {district_name}: {district_code}, {town_name}: {town_code}, {village_name}: {village_code}")
        return status_code, info, address_code


address_handle = AddressHandler()


def process(address):
    """Resolve ``address`` and wrap the outcome in a result dict.

    Returns:
        ``{"code", "info", "address_code"}``. ``code`` is ``-1`` and ``info``
        starts with ``"fail"`` when resolution raised; the exception is
        logged with its traceback and not propagated.
    """
    logger.info("========== start process ==========")
    output_data = {
        "code": -1,
        "info": "fail",
        "address_code": "",
    }
    try:
        status_code, info, address_code = address_handle.handle_address(address)
        output_data["code"] = status_code
        output_data["info"] = info
        output_data["address_code"] = address_code
    except Exception as e:
        logger.error(traceback.format_exc())
        output_data["code"] = -1
        output_data["info"] = f"fail: {e}"
    finally:
        logger.info(f"output_data: {output_data}")
        logger.info("========== end process ==========")
    return output_data


if __name__ == '__main__':
    address_list = ["四川省遂寧高新區寶升鎮插板堰村", "山東省淄博市臨淄區齊都鎮安合村委會", ""]
    for address in address_list:
        output = process(address)
        print(output)
    # Each line printed is a dict such as
    # {'code': 1, 'info': 'success', 'address_code': '<12-digit code>'};
    # the exact code depends on the contents of lib/address.json.
    # The empty address prints
    # {'code': 1, 'info': '省份和城市信息均未查到', 'address_code': '000000000000'}

本文來自互聯網用戶投稿,該文觀點僅代表作者本人,不代表本站立場。本站僅提供信息存儲空間服務,不擁有所有權,不承擔相關法律責任。
如若轉載,請注明出處:http://www.pswp.cn/web/89713.shtml
繁體地址,請注明出處:http://hk.pswp.cn/web/89713.shtml
英文地址,請注明出處:http://en.pswp.cn/web/89713.shtml

如若內容造成侵權/違法違規/事實不符,請聯系多彩編程網進行投訴反饋email:809451989@qq.com,一經查實,立即刪除!

相關文章

kafka的部署

目錄 一、kafka簡介 1.1、概述 1.2、消息系統介紹 1.3、點對點消息傳遞模式 1.4、發布-訂閱消息傳遞模式 二、kafka術語解釋 2.1、結構概述 2.2、broker 2.3、topic 2.4、producer 2.5、consumer 2.6、consumer group 2.7、leader 2.8、follower 2.9、partition…

小語種OCR識別技術實現原理

小語種OCR（光學字符識別）技術的實現原理涉及計算機視覺、自然語言處理（NLP）和深度學習等多個領域的融合，其核心目標是讓計算機能夠準確識別并理解不同語言的印刷或手寫文本。以下是其關鍵技術實現原理的詳細解析…

GPT:讓機器擁有“創造力”的語言引擎

當ChatGPT寫出莎士比亞風格的十四行詩，當GitHub Copilot自動生成編程代碼，背后都源于同一項革命性技術——**GPT（Generative Pre-trained Transformer）**。今天，我們將揭開這項“語言魔術”背后的科學原理！…

LeetCode|Day19|14. 最長公共前綴|Python刷題筆記

LeetCode｜Day19｜14. 最長公共前綴｜Python刷題筆記 🗓️ 本文屬于【LeetCode 簡單題百日計劃】系列 👉 點擊查看系列總目錄 >> 📌 題目簡介 題號：14. 最長公共前綴 難度：簡單…

安全事件響應分析--基礎命令

----萬能密碼oror1 or # 1or11 1 or 11安全事件響應分析------***windows***------方法開機啟動有無異常文件 【開始】→【運行】→【msconfig】文件排查 各個盤下的temp(tmp)相關目錄下查看有無異常文件 ：Windows產生的 臨時文件 可以通過查看日志且通過篩…

基于C#+SQL Server實現(Web)學生選課管理系統

學生選課管理系統的設計與開發一、項目背景學生選課管理系統是一個學校不可缺少的部分，傳統的人工管理檔案的方式存在著很多的缺點，如：效率低、保密性差等，所以開發一套綜合教務系統管理軟件很有必要，它應該具有傳統的…

垃圾回收(GC)

內存管理策略，在業務進程運行的過程中，由垃圾收集器以類似守護協程的方式在后臺運行，按照指定策略回收不再被使用的對象，釋放內存空間進行回收 優勢： 屏蔽內存回收的細節：屏蔽復雜的內存管理工作…

Datawhale AI夏令營-機器學習

比賽簡介 「用戶新增預測挑戰賽」是由科大訊飛主辦的一項數據科學競賽，旨在通過機器學習方法預測用戶是否為新增用戶 比賽屬于二分類任務，評價指標采用F1分數，分數越高表示模型性能越好。 如果你有一份帶標簽的表格型數據，只要…

Spring IOC容器在Web環境中是如何啟動的(源碼級剖析)?

文章目錄一、Web 環境中的 Spring MVC 框架二、Web 應用部署描述配置傳統配置（web.xml）：Java配置類（Servlet 3.0）：三、核心啟動流程詳解1. 啟動流程圖2. ★容器初始化入口：ContextLoaderListene…

18個優質Qt開源項目匯總

1，Clementine Music Player Clementine Music Player 是一個功能完善、跨平臺的開源音樂播放器，非常適合用于學習如何開發媒體類應用，尤其是跨平臺桌面應用。它基于 Qt 框架開發，支持多種操作系統，包括 Windows、macO…

計算機視覺:AI 的 “眼睛” 如何看懂世界?

1. 什么是計算機視覺：讓機器 “看見” 并 “理解” 的技術1.1 計算機視覺的核心目標計算機視覺（CV）是人工智能的一個重要分支，它讓計算機能夠 “看懂” 圖像和視頻 —— 不僅能捕捉像素信息，還能分析內容、提取語義…

華為OD刷題記錄

華為OD刷題記錄 刷過的題 入門 1、進制 2、NC61 doing 訂閱專欄

QT學習教程(二十五)

雙緩沖技術（Double Buffering）（ 2、公有函數實現）#include <QtGui> #include <cmath> using namespace std; #include "plotter.h"以上代碼為文件的開頭，在這里把std 的名空間加入到當前的全…

設計模式筆記_結構型_裝飾器模式

1.裝飾器模式介紹裝飾器模式是一種結構型設計模式，允許你動態地給對象添加行為，而無需修改其代碼。它的核心思想是將對象放入一個“包裝器”中，這個包裝器提供了額外的功能，同時保持原有對象的接口不變。想象一下，你有…

day25 力扣90.子集II 力扣46.全排列 力扣47.全排列 II

子集II給你一個整數數組 nums ，找出并返回所有該數組中不同的遞增子序列，遞增子序列中 至少有兩個元素 。你可以按 任意順序 返回答案。數組中可能含有重復元素，如出現兩個整數相等，也可以視作遞增序列的一種特殊情況。示例 1…

Solidity 中的`bytes`

在 Solidity 中，bytes 和 bytes32 都是用來保存二進制數據的類型，但它們的長度、使用場景、Gas 成本完全不同。? 一句話區分類型一句話總結bytes32定長 32 字節，適合做哈希、地址、標識符等固定長度數據。bytes動態長度字節數組，…

初學者STM32—PWM驅動電機與舵機

一、簡介 上一節課主要學習了輸出比較和PWM的基本原理和結構，本節課就主要以實踐為主通過STM32最小系統板和驅動器控制舵機和直流電機。 上一節課的坐標 初學者STM32—輸出比較與PWM-CSDN博客 二、舵機 舵機是一種根據輸入PWM信號占空比來控制輸出角度的裝置 輸…

C++中的異常處理機制:try-catch

一、基本概念 異常（Exception）：程序執行過程中發生的非正常情況，比如除以零、訪問越界、內存不足等。 異常處理（Exception Handling）：對異常情況進行捕獲、分析，并采取補救措施…

如何從 Windows 11 或 10 遠程訪問 Ubuntu 24.04 或 22.04 桌面

了解如何使用 RDP(遠程桌面協議)從 Windows 11 或 10 遠程連接 Ubuntu 24.04 Noble 或 22.04 LTS Jammy JellyFish 桌面的步驟。 Windows 提供了一個便捷的功能,稱為遠程桌面連接,它使用 RDP 協議來遠程連接 PC。當從 Windows 系統建立遠程桌面連接時,使用起來非常簡單,…

Linux 服務器中,Tab 鍵自動補全功能失效

在 Linux 服務器中，Tab 鍵自動補全功能失效通常與 bash-completion 組件缺失或配置異常有關。以下是解決問題的兩個關鍵 YUM 指令及操作步驟：1. 安裝 bash-completion 組件 sudo yum install -y bash-completion說明： bash-completion 是提供…