以《數據集筆記 geolife (操作篇)》(CSDN博客)中的軌跡為例
1 讀取數據
import pandas as pd
# Read one GeoLife .plt trajectory file. The first 6 lines are skipped and the
# file has no header row, so column names are supplied explicitly.
data = pd.read_csv(
    'Geolife Trajectories 1.3/Data//000/Trajectory/20081023025304.plt',
    header=None,
    skiprows=6,
    names=['Latitude', 'Longitude', 'Not_Important1', 'Altitude', 'Not_Important2', 'Date', 'Time'],
)
# Drop the two "Not_Important" columns. .copy() gives an independent frame so
# that later column assignments do not raise SettingWithCopyWarning.
data = data[['Latitude', 'Longitude', 'Altitude', 'Date', 'Time']].copy()
data
2 Date和Time 合并
# Join the Date and Time text columns and parse the result into one
# datetime column.
timestamp_text = data['Date'] + ' ' + data['Time']
data['Datetime'] = pd.to_datetime(timestamp_text)
# Keep only the merged timestamp; .copy() so later assignments work on an
# independent frame rather than a slice of the previous one.
data = data[['Latitude', 'Longitude', 'Altitude', 'Datetime']].copy()
data
3 只保留在北京城區的數據點
3.1 定義經緯度最值
import folium
# Central Beijing coordinates (lat, lon), used as the map centre.
BEIJING = [39.9, 116.41]
# Bounding-box limits for the Beijing extent, each as (lat, lon):
# B1 holds the lower limits, B2 the upper limits.
B1 = 39.8, 116.2
B2 = 40.0, 116.5
# folium's keyword for the initial zoom level is `zoom_start`; the original
# `start_zoom` is not a recognised option, so the requested zoom was not applied.
m = folium.Map(location=BEIJING, zoom_start=14)
# Mark the two bounding-box corners on the map.
folium.Marker(B1).add_to(m)
folium.Marker(B2).add_to(m)
m
3.2 進行地理位置篩選
# Keep only points strictly inside the latitude/longitude box (B1, B2).
in_lat = (data['Latitude'] > B1[0]) & (data['Latitude'] < B2[0])
in_lon = (data['Longitude'] > B1[1]) & (data['Longitude'] < B2[1])
# .copy() makes the filtered result an independent frame, so adding columns
# to it later does not raise SettingWithCopyWarning.
data = data[in_lat & in_lon].copy()
data
3.3 將time gap修改至5秒,保留每個5秒記錄的第一條
# Round every timestamp down to the start of its 5-second window.
data['Datetime_5s'] = data['Datetime'].dt.floor('5s')
data
# Keep only the first record of each 5-second window. .copy() so the result
# is an independent frame for the column assignments that follow.
data = data.drop_duplicates(subset=['Datetime_5s'], keep='first').copy()
data
4 去除停止點
# A row counts as "moving" when its latitude or longitude differs from the
# previous row's. The first row has no predecessor (shift() yields NaN), so
# the comparison is True and the row is kept.
prev_lat = data['Latitude'].shift()
prev_lon = data['Longitude'].shift()
data['is_moving'] = (data['Latitude'] != prev_lat) | (data['Longitude'] != prev_lon)
data
# Drop the stop points; the boolean column is used directly as the row mask.
data = data[data['is_moving']]
# Keep only the columns needed downstream; .copy() so later column
# assignments operate on an independent frame.
data = data[['Latitude', 'Longitude', 'Datetime_5s']].copy()
data
?
5 將10分鐘內沒有記錄的軌跡切分成兩條軌跡
5.1 計算 time gap
# Time gap between each record and the one before it (NaT for the first row).
data['time_diff'] = data['Datetime_5s'] - data['Datetime_5s'].shift(1)
data
5.2 切分軌跡
# Rows whose gap to the previous record exceeds 10 minutes start a new segment.
mask = data['time_diff'] > pd.Timedelta(minutes=10)
# split_id is 1 at those rows and 0 everywhere else.
data['split_id'] = mask.astype('int64')
data
# The running total turns the 0/1 flags into a segment number: it stays
# constant until the next flagged row, where it increases by one.
data['split_id'] = data['split_id'].cumsum()
data
5.3 得到id
# Trajectory id = "<num>_<split_id>", where num is set to 0 here.
num = 0
trajectory_prefix = f'{num}_'
data['id'] = trajectory_prefix + data['split_id'].astype(str)
data
6 計算每一條軌跡的長度,篩選短的,截斷長的
6.1 計算相鄰位置的經緯度差距
# Difference in latitude/longitude between consecutive rows of the same
# trajectory id; the first row of each id has no predecessor and gets NaN.
lat_lon_diff = data.groupby('id')[['Latitude', 'Longitude']].diff()
lat_lon_diff
6.2 計算haversine距離的函數
import numpy as np


def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are in decimal degrees. Only NumPy ufuncs are used,
    so scalars and equally-shaped arrays / pandas Series both work; array
    inputs give an element-wise result.
    """
    R = 6371  # Earth radius in kilometers
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    # Haversine formula: `a` is the squared half-chord length between the points.
    a = (
        np.sin(dlat / 2) * np.sin(dlat / 2)
        + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2))
        * np.sin(dlon / 2) * np.sin(dlon / 2)
    )
    # `c` is the angular distance in radians.
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c
6.3 計算同一軌跡相鄰位置的距離
import numpy as np

# Position of the previous record within the same trajectory id
# (NaN for the first record of each id).
prev_pos = data.groupby('id')[['Latitude', 'Longitude']].shift()
# Distance from the previous record to the current one, computed on the real
# coordinates. Passing coordinate *differences* against (0, 0) would evaluate
# the formula as if the points were at latitude 0 and lose the cos(latitude)
# scaling of the longitude term.
distance = haversine_distance(
    prev_pos['Latitude'], prev_pos['Longitude'],
    data['Latitude'], data['Longitude'],
)
data['distance'] = distance
data
6.4 計算同一id的累積距離
# Running sum of the step distances within each trajectory id.
data['accum_dis'] = data.groupby('id')['distance'].cumsum()
data
6.5 得到每一個id的軌跡距離
# Total length of each trajectory = the largest accumulated distance of its id.
iid = data.groupby('id')['accum_dis'].max()
# Turn the result into a two-column frame: 'id' and 'dis'.
iid = iid.reset_index(name='dis')
iid
6.6 篩選長度大于1km的
# Keep trajectories whose length ('dis') is at least 1.
iid = iid[iid['dis'] >= 1]
# .copy() so the filtered frame is independent and later column assignments
# do not raise SettingWithCopyWarning.
data = data[data['id'].isin(iid['id'])].copy()
data
6.7 將長度長于10km的軌跡按每10km拆分成多條,并去掉拆分后長度小于1km的
# Number of whole multiples of 10 in the accumulated distance; rows where the
# accumulated distance is NaN are put into chunk 0.
chunk_no = (data['accum_dis'] // 10).fillna(0)
data['split_traj_id'] = chunk_no.astype(int).astype(str)
data
# Append the chunk number to the trajectory id.
data['id'] = data['id'] + '_' + data['split_traj_id']
data
去除切分后長度小于1km的:
# Length of each sub-trajectory after the split. 'accum_dis' keeps counting
# from the start of the pre-split trajectory, so its maximum would credit
# every chunk after the first with 10 or more and never flag a short tail.
# Summing the per-step distances inside each new id measures the chunk
# itself (NaN steps are skipped by sum()).
iid = data.groupby('id')['distance'].sum()
iid = iid.reset_index(name='distance')
iid
# Keep chunks of at least 1, the same threshold used before the split.
iid = iid[iid['distance'] >= 1]
data = data[data['id'].isin(iid['id'])]
data
7 剔除記錄數量小于10條的軌跡
# Count the records of every trajectory id, as a frame with 'id' and 'count'.
iid = data.groupby('id').size().reset_index(name='count')
# Keep ids that have at least 10 records.
iid = iid.loc[iid['count'] >= 10]
iid
data = data.loc[data['id'].isin(iid['id'])]
8 去除“staypoint”
這里的staypoint 意為 最值經緯度對應的距離小于1km
# Bounding box (extreme latitude/longitude) of every trajectory id,
# indexed by id.
latlon = data.groupby('id').agg(
    max_lat=('Latitude', 'max'),
    min_lat=('Latitude', 'min'),
    max_lon=('Longitude', 'max'),
    min_lon=('Longitude', 'min'),
)
# Distance between the (max_lat, max_lon) and (min_lat, min_lon) corners of
# that box, computed for all ids in one vectorised call.
latlon['max_dis'] = haversine_distance(
    latlon['max_lat'], latlon['max_lon'],
    latlon['min_lat'], latlon['min_lon'],
)
# Keep only ids whose box diagonal is at least 1; smaller ones are treated as
# stay points.
latlon = latlon[latlon['max_dis'] >= 1]
latlon
data = data[data['id'].isin(latlon.index)]
data