The LoFTR model comes from LoFTR: Detector-Free Local Feature Matching with Transformers, a paper published at CVPR 2021. It performs image registration with a PyTorch implementation and, unlike the superpoint+superglue pipeline, it is an end-to-end registration method. Related to the official LoFTR repository is the loftr2onnx repository; overall loftr2onnx is more convenient to use and gives better results, but the ONNX model it exports is broken and cannot be used.
Project address: https://github.com/zju3dv/LoFTR
Project address 2: https://github.com/oooooha/loftr2onnx
Demo: https://huggingface.co/spaces/kornia/Kornia-LoFTR
1、Web demo
Visit https://huggingface.co/spaces/kornia/Kornia-LoFTR
The point-matching result is shown below; compared with the sp+sg method, the point pairs look noticeably more parallel.
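If you prefer to reproduce the demo locally instead of in the browser, Kornia ships a packaged LoFTR. The snippet below is a minimal sketch, assuming kornia is installed (pip install kornia) and that t1.jpg / t2.jpg are placeholder paths for your own image pair; the output keys follow Kornia's correspondence dictionary.
import cv2
import torch
import kornia.feature as KF

def load_gray(path, size=(384, 384)):
    # read as grayscale, resize, scale to [0, 1], shape [1, 1, H, W]
    img = cv2.resize(cv2.imread(path, cv2.IMREAD_GRAYSCALE), size)
    return torch.from_numpy(img)[None, None].float() / 255.0

matcher = KF.LoFTR(pretrained="indoor")  # weights are downloaded automatically
batch = {"image0": load_gray("t1.jpg"), "image1": load_gray("t2.jpg")}
with torch.no_grad():
    out = matcher(batch)
print(out["keypoints0"].shape, out["keypoints1"].shape, out["confidence"].shape)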
2、Using the LoFTR project
2.1 Download the code
Open https://github.com/zju3dv/LoFTR and download the code.
2.2 Install dependencies
To install the project, unzip the code, open a terminal in the project directory, and run pip install -r .\requirements.txt
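After installing, it is worth checking that the key packages import and that a CUDA device is visible, because every example below moves the model to the GPU with .cuda(). A quick sanity check (just a sketch, nothing LoFTR-specific):
import torch
import einops  # used by LoFTR's transformer code
import yacs    # used by LoFTR's config files
print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())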
2.3 下載模型
https://drive.google.com/drive/folders/1DOcOPZb3-5cWxLqn256AhwUVjBPifhuf 下載權重。如果需要訓練,可以下載訓練數據與從測試數據。
將權重的壓縮文件解壓后放到項目根目錄下
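A small check that the checkpoint ends up where the examples expect it (a sketch, assuming the archive was unpacked into a weights/ folder in the project root):
import torch
ckpt = torch.load("weights/indoor_ds_new.ckpt", map_location="cpu")
state_dict = ckpt["state_dict"]  # the examples below load exactly this key
print(len(state_dict), "parameter tensors, e.g.", list(state_dict)[:3])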
2.4 First run
Save the code from section 3 of https://hpg123.blog.csdn.net/article/details/124824892 (the section titled "read_img_as_tensor function in superpoint") as imgutils.py.
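That helper file is not reproduced here, so the sketch below only shows what the examples assume it provides: read an image, resize it, and return a [1,1,H,W] float tensor on the GPU together with the resized image for display (the plotting helpers myimshow / myimshowsCL also come from that file and are not sketched). Treat the function bodies as assumptions, not the original implementation.
import cv2
import numpy as np
import torch

def read_img_as_tensor_gray(path, size=(384, 384), device="cuda"):
    # color image kept for display, grayscale [1, 1, H, W] tensor in [0, 1] for LoFTR
    img = cv2.resize(cv2.imread(path, cv2.IMREAD_COLOR), size)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    t = torch.from_numpy(gray)[None, None].float().to(device) / 255.0
    return t, img

def read_img_as_tensor(path, size=(384, 384), device="cuda"):
    # LoFTR's backbone expects a single-channel input, so this is assumed to behave the same
    return read_img_as_tensor_gray(path, size, device)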
from src.loftr import LoFTR, default_cfg
import torch
from imgutils import *
import time
# Initialize LoFTR
matcher = LoFTR(config=default_cfg)
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

p1=r'C:\Users\hpg\Pictures\t1.jpg'
p2=r'C:\Users\hpg\Pictures\t2.jpg'
t1,im1=read_img_as_tensor(p1,(384,384))
t2,im2=read_img_as_tensor(p2,(384,384))
batch = {'image0': t1, 'image1': t2}
# Inference
with torch.no_grad():
    matcher(batch)  # warm-up run before timing
    t0=time.time()
    times=10
    for i in range(times):
        matcher(batch)
    rt1=time.time()-t0
    rt1=rt1/times
    mkpts0 = batch['mkpts0_f'].cpu().numpy()
    mkpts1 = batch['mkpts1_f'].cpu().numpy()
    mconf = batch['mconf'].cpu().numpy()
print(f'runtime: {rt1:.4f}',mkpts0.shape,mkpts1.shape,mconf)
The output is shown below. A single image takes about 0.19 s (laptop with a 1060 GPU); on a desktop 3060 it should take roughly 0.05 s per image.
runtime: 0.1933 (32, 2) (32, 2) [0.22855578 0.21740437 0.34927088 0.28389925 0.27157754 0.26966828 0.22636016 0.22058277 0.20475665 0.20878278 0.22838292 0.25448585 0.27047077 0.34403533 0.22612476 0.2044811 0.26239234 0.32797554 0.2263804 0.26544347 0.3401669 0.39336586 0.3473139 0.28230694 0.23061718 0.23949552 0.46178365 0.3540019 0.5322925 0.27200237 0.26731068 0.39827508]
3、Registration with the loftr2onnx library
3.1 Basic usage
Image registration can also be done with the https://github.com/oooooha/loftr2onnx project.
The code is as follows:
from loftr_wrapper import LoFTRWrapper as LoFTR
import torch
from imgutils import *
import time
# Initialize LoFTR
matcher = LoFTR()
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

p1=r'C:\Users\hpg\Pictures\t1.jpg'
p2=r'C:\Users\hpg\Pictures\t2.jpg'
t1,im1=read_img_as_tensor(p1,(384,384))
t2,im2=read_img_as_tensor(p2,(384,384))
# Inference
with torch.no_grad():
    result=matcher(t1,t2)  # warm-up run before timing
    t0=time.time()
    times=10
    for i in range(times):
        result=matcher(t1,t2)
    rt1=time.time()-t0
    rt1=rt1/times
    mkpts0 = result['keypoints0'].cpu().numpy()
    mkpts1 = result['keypoints1'].cpu().numpy()
    mconf = result['confidence'].cpu().numpy()
print(f'runtime: {rt1:.4f}',mkpts0.shape,mkpts1.shape,mconf)
The output (truncated below) differs from that of the LoFTR project: 212 matches instead of 32, with noticeably higher confidence values.
runtime: 0.1925 (212, 2) (212, 2) [0.4566688 0.53420454 0.5319168 0.5320238 0.46744433 0.4068214 0.5363396 0.45674214 0.60001785 0.6576139 0.53006035 0.59590924 0.5725811 0.5505655 0.44364485 0.40315574 0.4293331 0.5060973 0.6550978 0.52451503 0.553644 0.63088214 0.6906601 0.61668074 ... 0.42850277 0.4645101 0.5070625]
3.2 Analysis of the differences
1、The main reason is that the loftr2onnx project adjusts LoFTR's forward pass through loftr_wrapper.py:
#!/usr/bin/env python
import copy
import os
import sys
from typing import Any, Dict

import torch
from einops.einops import rearrange

_CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(_CURRENT_DIR, "LoFTR"))

from loftr import LoFTR, default_cfg

DEFAULT_CFG = copy.deepcopy(default_cfg)
DEFAULT_CFG["coarse"]["temp_bug_fix"] = True


class LoFTRWrapper(LoFTR):
    def __init__(
        self,
        config: Dict[str, Any] = DEFAULT_CFG,
    ):
        LoFTR.__init__(self, config)

    def forward(
        self,
        image0: torch.Tensor,
        image1: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        data = {
            "image0": image0,
            "image1": image1,
        }
        del image0, image1

        data.update(
            {
                "bs": data["image0"].size(0),
                "hw0_i": data["image0"].shape[2:],
                "hw1_i": data["image1"].shape[2:],
            }
        )

        if data["hw0_i"] == data["hw1_i"]:  # faster & better BN convergence
            feats_c, feats_f = self.backbone(
                torch.cat([data["image0"], data["image1"]], dim=0)
            )
            (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split(
                data["bs"]
            ), feats_f.split(data["bs"])
        else:  # handle different input shapes
            (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone(
                data["image0"]
            ), self.backbone(data["image1"])

        data.update(
            {
                "hw0_c": feat_c0.shape[2:],
                "hw1_c": feat_c1.shape[2:],
                "hw0_f": feat_f0.shape[2:],
                "hw1_f": feat_f1.shape[2:],
            }
        )

        # 2. coarse-level loftr module
        # add featmap with positional encoding, then flatten it to sequence [N, HW, C]
        feat_c0 = rearrange(self.pos_encoding(feat_c0), "n c h w -> n (h w) c")
        feat_c1 = rearrange(self.pos_encoding(feat_c1), "n c h w -> n (h w) c")

        mask_c0 = mask_c1 = None  # mask is useful in training
        if "mask0" in data:
            mask_c0, mask_c1 = data["mask0"].flatten(-2), data["mask1"].flatten(-2)
        feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, mask_c1)

        # 3. match coarse-level
        self.coarse_matching(feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1)

        # 4. fine-level refinement
        feat_f0_unfold, feat_f1_unfold = self.fine_preprocess(
            feat_f0, feat_f1, feat_c0, feat_c1, data
        )
        if feat_f0_unfold.size(0) != 0:  # at least one coarse level predicted
            feat_f0_unfold, feat_f1_unfold = self.loftr_fine(
                feat_f0_unfold, feat_f1_unfold
            )

        # 5. match fine-level
        self.fine_matching(feat_f0_unfold, feat_f1_unfold, data)

        rename_keys: Dict[str, str] = {
            "mkpts0_f": "keypoints0",
            "mkpts1_f": "keypoints1",
            "mconf": "confidence",
        }
        out: Dict[str, torch.Tensor] = {}
        for k, v in rename_keys.items():
            _d = data[k]
            if isinstance(_d, torch.Tensor):
                out[v] = _d
            else:
                raise TypeError(f"Expected torch.Tensor for item `{k}`. Gotcha {type(_d)}")
        del data

        return out
2、In addition, the config differs.
The default configuration used by loftr2onnx is in loftr\utils\cvpr_ds_config.py, shown below. Note that _CN.MATCH_COARSE.THR and _CN.MATCH_COARSE.BORDER_RM have been modified and differ from the official defaults.
from yacs.config import CfgNode as CN


def lower_config(yacs_cfg):
    if not isinstance(yacs_cfg, CN):
        return yacs_cfg
    return {k.lower(): lower_config(v) for k, v in yacs_cfg.items()}


_CN = CN()
_CN.BACKBONE_TYPE = 'ResNetFPN'
_CN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)]
_CN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd
_CN.FINE_CONCAT_COARSE_FEAT = True

# 1. LoFTR-backbone (local feature CNN) config
_CN.RESNETFPN = CN()
_CN.RESNETFPN.INITIAL_DIM = 128
_CN.RESNETFPN.BLOCK_DIMS = [128, 196, 256]  # s1, s2, s3

# 2. LoFTR-coarse module config
_CN.COARSE = CN()
_CN.COARSE.D_MODEL = 256
_CN.COARSE.D_FFN = 256
_CN.COARSE.NHEAD = 8
_CN.COARSE.LAYER_NAMES = ['self', 'cross'] * 4
_CN.COARSE.ATTENTION = 'linear' # options: ['linear', 'full']
_CN.COARSE.TEMP_BUG_FIX = False

# 3. Coarse-Matching config
_CN.MATCH_COARSE = CN()
_CN.MATCH_COARSE.THR = 0.4 # thresh default=0.2
_CN.MATCH_COARSE.BORDER_RM = 4 # border default=2
_CN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax, 'sinkhorn']
_CN.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1
_CN.MATCH_COARSE.SKH_ITERS = 3
_CN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0
_CN.MATCH_COARSE.SKH_PREFILTER = True
_CN.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.4 # training tricks: save GPU memory
_CN.MATCH_COARSE.TRAIN_PAD_NUM_GT_MIN = 200  # training tricks: avoid DDP deadlock

# 4. LoFTR-fine module config
_CN.FINE = CN()
_CN.FINE.D_MODEL = 128
_CN.FINE.D_FFN = 128
_CN.FINE.NHEAD = 8
_CN.FINE.LAYER_NAMES = ['self', 'cross'] * 1
_CN.FINE.ATTENTION = 'linear'

default_cfg = lower_config(_CN)
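Since section 3.2 points to _CN.MATCH_COARSE.THR and _CN.MATCH_COARSE.BORDER_RM (plus the temp_bug_fix flag flipped in loftr_wrapper.py) as the config-level differences, the official repository can be made to behave like loftr2onnx by overriding these keys on its lower-cased default_cfg dict. A minimal sketch (key names follow lower_config as shown above):
import copy
import torch
from src.loftr import LoFTR, default_cfg

cfg = copy.deepcopy(default_cfg)
cfg['match_coarse']['thr'] = 0.4       # loftr2onnx default; official default is 0.2
cfg['match_coarse']['border_rm'] = 4   # loftr2onnx default; official default is 2
cfg['coarse']['temp_bug_fix'] = True   # also flipped in loftr_wrapper.py

matcher = LoFTR(config=cfg)
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()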
4、Extracting the overlap region with LoFTR
4.1 Using the LoFTR repository
The imgutils here differs slightly from the one described earlier: open https://blog.csdn.net/a486259/article/details/124824892 and save the code in section 1 as imgutils.py.
from src.loftr import LoFTR, default_cfg
import torch
from imgutils import *
import time
# Initialize LoFTR
matcher = LoFTR(config=default_cfg)
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

p1=r'C:\Users\hpg\Pictures\t1.jpg'
p2=r'C:\Users\hpg\Pictures\t2.jpg'
t1,im1=read_img_as_tensor_gray(p1,(384,384))
t2,im2=read_img_as_tensor_gray(p2,(384,384))
batch = {'image0': t1, 'image1': t2}
# Inference
with torch.no_grad():
    t0=time.time()
    times=1
    for i in range(times):
        matcher(batch)
    rt1=time.time()-t0
    rt1=rt1/times
    # keep these as tensors; they are converted point by point further below
    mkpts0 = batch['mkpts0_f']#.cpu().numpy()
    mkpts1 = batch['mkpts1_f']#.cpu().numpy()
    confidence = batch['mconf']#.cpu().numpy()
print(f'runtime: {rt1:.4f}',mkpts0.shape,mkpts1.shape)

import cv2 as cv
pt_num = mkpts0.shape[0]
im_dst,im_res=im1,im2
img = np.zeros((max(im_dst.shape[0], im_res.shape[0]), im_dst.shape[1]+im_res.shape[1]+10,3))
img[:,:im_res.shape[0],]=im_dst
img[:,-im_res.shape[0]:]=im_res
img=img.astype(np.uint8)
match_threshold=0.6
for i in range(0, pt_num):
    if (confidence[i] > match_threshold):
        pt0 = mkpts0[i].to('cpu').numpy().astype(np.int32)
        pt1 = mkpts1[i].to('cpu').numpy().astype(np.int32)
        #cv.circle(img, (pt0[0], pt0[1]), 1, (0, 0, 255), 2)
        #cv.circle(img, (pt1[0], pt1[1]+650), (0, 0, 255), 2)
        cv.line(img, tuple(pt0.tolist()), (pt1[0]+im_res.shape[0], pt1[1]), (0, 255, 0), 1)
myimshow( img,size=12)

import cv2
def getGoodMatchPoint(mkpts0, mkpts1, confidence, match_threshold:float=0.5):
    n = min(mkpts0.size(0), mkpts1.size(0))
    srcImage1_matchedKPs, srcImage2_matchedKPs = [], []
    if (match_threshold > 1 or match_threshold < 0):
        print("match_threshold error!")
    for i in range(n):
        kp0 = mkpts0[i]
        kp1 = mkpts1[i]
        pt0 = (kp0[0].item(), kp0[1].item())
        pt1 = (kp1[0].item(), kp1[1].item())
        c = confidence[i].item()
        if (c > match_threshold):
            srcImage1_matchedKPs.append(pt0)
            srcImage2_matchedKPs.append(pt1)
    return np.array(srcImage1_matchedKPs), np.array(srcImage2_matchedKPs)

pts_src, pts_dst = getGoodMatchPoint(mkpts0, mkpts1, confidence)
h1, status = cv2.findHomography(pts_src, pts_dst, cv.RANSAC, 8)
im_out1 = cv2.warpPerspective(im_dst, h1, (im_dst.shape[1],im_dst.shape[0]))
im_out2 = cv2.warpPerspective(im_res, h1, (im_dst.shape[1],im_dst.shape[0]),16)
# at this point im_res and im_out1 are strictly registered
myimshowsCL([im_dst,im_out1,im_res,im_out2],rows=2,cols=2, size=6)
The code fails at this point: too few point pairs are matched, so the homography needed to extract the overlap region cannot be estimated.
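cv2.findHomography needs at least four correspondences, so a small guard (a sketch reusing getGoodMatchPoint and the variables from the code above) makes the failure explicit and can fall back to a lower threshold instead of crashing:
pts_src, pts_dst = getGoodMatchPoint(mkpts0, mkpts1, confidence, match_threshold=0.5)
if len(pts_src) < 4:
    # not enough confident matches; retry with a more permissive threshold
    print(f"only {len(pts_src)} matches above the threshold, retrying with 0.2")
    pts_src, pts_dst = getGoodMatchPoint(mkpts0, mkpts1, confidence, match_threshold=0.2)
if len(pts_src) >= 4:
    h1, status = cv2.findHomography(pts_src, pts_dst, cv.RANSAC, 8)
else:
    h1 = None
    print("still not enough matches to estimate a homography")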
4.2 Using the loftr2onnx library
The imgutils here is the same slightly different version as in 4.1: open https://blog.csdn.net/a486259/article/details/124824892 and save the code in section 1 as imgutils.py.
The code is as follows:
from loftr_wrapper import LoFTRWrapper as LoFTR
import torch
from imgutils import *
import time
# Initialize LoFTR
matcher = LoFTR()
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

p1=r'C:\Users\hpg\Pictures\t1.jpg'
p2=r'C:\Users\hpg\Pictures\t2.jpg'
t1,im1=read_img_as_tensor_gray(p1,(384,384))
t2,im2=read_img_as_tensor_gray(p2,(384,384))
# Inference
with torch.no_grad():
    #result=matcher(t1,t2)
    t0=time.time()
    times=1
    for i in range(times):
        result=matcher(t1,t2)
    rt1=time.time()-t0
    rt1=rt1/times
    mkpts0 = result['keypoints0']#.cpu().numpy()
    mkpts1 = result['keypoints1']#.cpu().numpy()
    confidence = result['confidence']#.cpu().numpy()
print(f'runtime: {rt1:.4f}',mkpts0.shape,mkpts1.shape,confidence)

import cv2 as cv
pt_num = mkpts0.shape[0]
im_dst,im_res=im1,im2
img = np.zeros((max(im_dst.shape[0], im_res.shape[0]), im_dst.shape[1]+im_res.shape[1]+10,3))
img[:,:im_res.shape[0],]=im_dst
img[:,-im_res.shape[0]:]=im_res
img=img.astype(np.uint8)
match_threshold=0.01
for i in range(0, pt_num):
    if (confidence[i] > match_threshold):
        pt0 = mkpts0[i].to('cpu').numpy().astype(np.int32)
        pt1 = mkpts1[i].to('cpu').numpy().astype(np.int32)
        #cv.circle(img, (pt0[0], pt0[1]), 1, (0, 0, 255), 2)
        #cv.circle(img, (pt1[0], pt1[1]+650), (0, 0, 255), 2)
        cv.line(img, tuple(pt0.tolist()), (pt1[0]+im_res.shape[0], pt1[1]), (0, 255, 0), 1)
myimshow( img,size=12)

import cv2
def getGoodMatchPoint(mkpts0, mkpts1, confidence, match_threshold:float=0.5):
    n = min(mkpts0.size(0), mkpts1.size(0))
    srcImage1_matchedKPs, srcImage2_matchedKPs = [], []
    if (match_threshold > 1 or match_threshold < 0):
        print("match_threshold error!")
    for i in range(n):
        kp0 = mkpts0[i]
        kp1 = mkpts1[i]
        pt0 = (kp0[0].item(), kp0[1].item())
        pt1 = (kp1[0].item(), kp1[1].item())
        c = confidence[i].item()
        if (c > match_threshold):
            srcImage1_matchedKPs.append(pt0)
            srcImage2_matchedKPs.append(pt1)
    return np.array(srcImage1_matchedKPs), np.array(srcImage2_matchedKPs)

pts_src, pts_dst = getGoodMatchPoint(mkpts0, mkpts1, confidence)
h1, status = cv2.findHomography(pts_src, pts_dst, cv.RANSAC, 4)
# im_dst=im_dst.astype(np.float32)/255
# im_res=im_res.astype(np.float32)/255
print(im_dst.shape,im_dst.dtype,im_dst.max(),im_res.shape,im_res.dtype,im_res.max(),h1)
im_out1 = cv2.warpPerspective(im_dst, h1, (im_dst.shape[1],im_dst.shape[0]))
im_out2 = cv2.warpPerspective(im_res, h1, (im_dst.shape[1],im_dst.shape[0]),16)
# at this point im_res and im_out1 are strictly registered
myimshowsCL([im_dst,im_out1,im_res,im_out2],rows=2,cols=2, size=6)
The extracted point correspondences are shown below.
The extracted overlap region:
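To go from the homography to an explicit overlap region, one option (a sketch continuing from the variables h1, im_dst and im_res of section 4.2, where h1 maps pixels of im_dst into the frame of im_res) is to warp an all-white mask and intersect it with the image area:
import cv2
import numpy as np

h, w = im_res.shape[:2]
mask_dst = np.full(im_dst.shape[:2], 255, dtype=np.uint8)

# pixels of im_res that im_dst also covers -> overlap in im_res's frame
overlap_in_res = cv2.warpPerspective(mask_dst, h1, (w, h)) > 0
# the same region expressed in im_dst's frame, via the inverse homography
mask_res = np.full((h, w), 255, dtype=np.uint8)
overlap_in_dst = cv2.warpPerspective(mask_res, np.linalg.inv(h1),
                                     (im_dst.shape[1], im_dst.shape[0])) > 0

# tightest bounding box of the overlap inside im_res
ys, xs = np.where(overlap_in_res)
overlap_crop = im_res[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
print(f"overlap covers {overlap_in_res.mean():.1%} of image1")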