代码拉取完成,页面将自动刷新
import os,sqlite3,cv2
import numpy as np
import uuid
from qgis.PyQt.QtWidgets import QApplication
from .utils import cosine_similarity,correlation_coefficient,mutual_info,print_log
class LabelItem:
def __init__(self,title,id,superid,cc,desc,rules):
self.title=title
self.id=id
self.superid=superid
self.cc=cc
self.desc=desc
self.reshape_rule=rules
class LabelSample:
def __init__(self,labelid=-1,samp_img=None,gebd=None,lebd=None):
self.labelid=labelid
self.samp_img = samp_img
self.gebd=gebd
self.lebd1=lebd
def set_id(self,fid):
self.fid=fid
def set_histogram(self,ghis,lhis):
#print_log("Befor set Hist:",ghis.dtype,lhis.dtype)
self.ghis=ghis
self.lhis=lhis
def set_source(self,src_layer,src_featid):
self.src_layer=src_layer
self.src_featid=src_featid
def set_sizeinfo(self,mask_sizex,mask_sizey,samp_len):
self.samp_len=samp_len
self.mask_sizex=mask_sizex
self.mask_sizey=mask_sizey
def set_entropy(self,samp_entropy,parcel_entropy):
self.samp_entropy=samp_entropy
self.parcel_entropy=parcel_entropy
def set_position(self,longitude,latitude):
self.longitude=longitude
self.latitude=latitude
def set_simpca(self,gsim_pca,lsim_pca):
self.gsim_pca=gsim_pca
self.lsim_pca=lsim_pca
def set_image_meta(self,img_date,img_resolution,img_platform,samp_resolution):
self.img_date=img_date
self.img_resolution=img_resolution
self.img_platform=img_platform
self.samp_resolution=samp_resolution
def set_selfsim(self,glsim,glhissim):
self.glsim=glsim
self.glhissim=glhissim
class LabelBase(object):
def __init__(self,baseFile):
self.baseFile=baseFile
self.connection=None
self.labelItems=[]
self.guid=None
self.interruptLongTransaction=False
if baseFile is not None and baseFile.strip()!='':
if not os.path.exists(baseFile):
self.initLabelBase(baseFile)
else:
self.getLabelItems()
def __del__(self):
self.close()
def initLabelBase(self,filepath):
self.connect()
try:
cursor = self.connection.cursor()
cursor.execute('''CREATE TABLE meta(
fid integer PRIMARY KEY AUTOINCREMENT,
guid varchar(64) NOT NULL,
author character(16),
org varchar(128),
desc varchar(255),
created datetime DEFAULT CURRENT_TIMESTAMP,
updated datetime DEFAULT CURRENT_TIMESTAMP
)''')
self.connection.commit()
cursor.execute('''CREATE TABLE labels(
fid integer PRIMARY KEY AUTOINCREMENT,
superid integer DEFAULT -1,
title varchar(64) NOT NULL,
description varchar(255),
reshape_rule varchar(255) DEFAULT "SimplifyPreserveTopology",
cc character(10),
used_times integer DEFAULT 0,
created datetime DEFAULT CURRENT_TIMESTAMP,
updated datetime DEFAULT CURRENT_TIMESTAMP
)''')
self.connection.commit()
cursor.execute('''CREATE TABLE samples(
fid integer PRIMARY KEY AUTOINCREMENT,
labelid integer NOT NULL,
src_layer varchar(64),
src_featid integer,
gebd Blob,
lebd1 Blob,
lebd2 Blob,
ghis Blob,
lhis Blob,
mask_sizex integer,
mask_sizey integer,
samp_img Blob,
samp_len integer,
img_date Date,
img_resolution Real,
img_platform character(10),
samp_resolution Real,
samp_entropy Real,
parcel_entropy Real,
gsim_pca Real DEFAULT 0,
lsim_pca Real DEFAULT 0,
glsim Real DEFAULT 0,
glhissim Real DEFAULT 0,
latitude Real,
longitude Real,
gsim_agents Real DEFAULT 0,
lsim_agents Real DEFAULT 0,
catch_on datetime DEFAULT CURRENT_TIMESTAMP
)''')
self.connection.commit()
cursor.execute('''CREATE TABLE ebdpca(
fid integer PRIMARY KEY AUTOINCREMENT,
labelid integer NOT NULL,
gpca Blob,
lpca Blob,
gscore Real,
lscore Real,
resolution Real,
extent character(128),
gagents Blob,
lagents Blob,
gagent_count smallint,
lagent_count smallint
)''')
self.connection.commit()
cursor.execute('''CREATE TABLE similarity(
fid integer PRIMARY KEY AUTOINCREMENT,
fromsampid integer NOT NULL,
withsampid integer NOT NULL,
ggsim Real DEFAULT 0,
glsim Real DEFAULT 0,
llsim Real DEFAULT 0,
lgsim Real DEFAULT 0,
ggcorrcoef Real DEFAULT 0,
glcorrcoef Real DEFAULT 0,
llcorrcoef Real DEFAULT 0,
lgcorrcoef Real DEFAULT 0,
mutualinfo Real DEFAULT 0,
gghissim Real DEFAULT 0,
glhissim Real DEFAULT 0,
llhissim Real DEFAULT 0,
lghissim Real DEFAULT 0
)''')
self.connection.commit()
cursor.execute('''create view samp_sim AS
select distinct fromsample.*,withsample.*,ggsim,glsim,llsim,lgsim,ggcorrcoef, glcorrcoef,llcorrcoef,lgcorrcoef,mutualinfo
from (
select samples.fid as fromsampid,labels.title as fromtitle,samples.mask_sizex as fromx,
samples.mask_sizey as fromy,samples.samp_len as fromlen,glsim as fromselfsim,samp_entropy as from_entropy
from labels,samples
where labels.fid=samples.labelid
) as fromsample,
(
select samples.fid as withsampid,labels.title as withtitle,samples.mask_sizex as withx,
samples.mask_sizey as withy,samples.samp_len as withlen,glsim as withlfsim,samp_entropy as with_entropy
from labels,samples
where labels.fid=samples.labelid
) as withsample,
similarity
where similarity.fromsampid=fromsample.fromsampid and similarity.withsampid=withsample.withsampid
order by llsim desc,fromtitle,fromsampid,withtitle,withsampid
''')
self.connection.commit()
cursor.close()
except Exception as e:
print_log(e)
print_log("Create table failed")
finally:
self.connection.close()
def connect(self):
if not self.is_connected():
# if os.path.exists(self.baseFile):
if self.baseFile is not None and self.baseFile.strip()!="":
self.connection= sqlite3.connect(self.baseFile)
return True
else:
self.connection=None
return False
else:
return True
def close(self):
if self.connection:
self.connection.close()
def is_connected(self):
try:
if self.connection:
self.connection.execute("SELECT * FROM labels LIMIT 1")
return True
else:
return False
except sqlite3.ProgrammingError as e:
return False
def logMetaInfo(self,author,org,description="",close_connect=True):
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select updated from labels order by updated DESC limit 1")
updateTime = lblcur.fetchone()
lblcur.execute("select guid from meta")
guid = lblcur.fetchone()
if guid:
self.guid=guid[0]
if updateTime:
if description:
sql ="update meta set author='{}',org='{}',desc='{}',updated='{}'".format(author, org,description,updateTime[0])
else:
sql ="update meta set author='{}',org='{}',updated='{}'".format(author, org,updateTime[0])
else:
if description:
sql = "update meta set author='{}',org='{}',desc='{}'".format(author, org,description)
else:
sql = "update meta set author='{}',org='{}'".format(author, org)
lblcur.execute(sql)
else:
self.guid=str(uuid.uuid1())
lblcur.execute("insert into meta(guid,author,org,desc) values(?,?,?,?)", (self.guid, author, org,description))
self.connection.commit()
lblcur.close()
if close_connect:
self.close()
def updateMetaTime(self,close_connect=True):
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select updated from labels order by updated DESC limit 1")
updateTime = lblcur.fetchone()[0]
if updateTime:
lblcur.execute("update meta set updated='{}'".formate(updateTime))
self.connection.commit()
lblcur.close()
if close_connect:
self.close()
def getMetaInfo(self,close_connect=True):
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select guid,author,org,desc,created,updated from meta")
meta = lblcur.fetchone()
lblcur.close()
if close_connect:
self.close()
if meta:
self.guid=meta[0]
else:
self.guid=None
return meta
def getLabelItems(self,close_connect=True):
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select title,fid,superid,cc,description,reshape_rule from labels")
labels = lblcur.fetchall()
lblcur.close()
self.labelItems.clear()
for label in labels:
rules=label[5].split(",") if label[5] else []
self.labelItems.append(LabelItem(label[0], label[1], label[2], label[3], label[4],rules))
if close_connect:
self.close()
def getLabelItemById(self,id):
for item in self.labelItems:
if item.id==id:
return item
return None
def incedentLabelUsedTimes(self,id,times=1,close_connect=True):
try:
#print_log("update labels set used_times= used_times+{} where fid={}".format(times,id))
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("update labels set used_times= used_times+{} where fid={}".format(times,id))
self.connection.commit()
except sqlite3.Error as error:
print_log("Failed to update used_times.", error)
finally:
lblcur.close()
if close_connect:
self.close()
def getLabelItemByTitle(self,title):
for item in self.labelItems:
if item.title==title:
return item
return None
def getLabelSamplesByLabelId(self,labelid,limit=5,close_connect=True):
if not self.connect(): return
labelSamples=[]
lblcur = self.connection.cursor()
sql="select fid,gebd,lebd1,samp_img,samp_entropy,longitude,latitude from samples where labelid={}".format(labelid)
sql=sql + " order by random()"
if limit>0:
sql = sql + " limit "+str(limit)
#print_log(sql)
lblcur.execute(sql)
samples = lblcur.fetchall()
lblcur.close()
for sample in samples:
labelSample=LabelSample()
labelSample.labelid=labelid
labelSample.id=sample[0]
labelSample.gebd =np.frombuffer(sample[1],dtype=np.float32).reshape((1, -1))
labelSample.lebd1=np.frombuffer(sample[2],dtype=np.float32).reshape((1, -1))
#labelSample.lebd2=np.frombuffer(sample[3],dtype=np.float32).reshape((1, -1))
labelSample.samp_img= cv2.imdecode(np.frombuffer(sample[3],dtype=np.uint8),cv2.IMREAD_COLOR)
labelSample.samp_entropy = sample[4]
labelSample.longitude = sample[5]
labelSample.latitude = sample[6]
labelSamples.append(labelSample)
if close_connect:
self.close()
return labelSamples
def getLabelSampleById(self,sampleid,close_connect=True):
if not self.connect(): return None
lblcur = self.connection.cursor()
sql="select fid,labelid,gebd,lebd1,samp_img,ghis,lhis,samp_entropy from samples where fid={}".format(sampleid)
lblcur.execute(sql)
sample = lblcur.fetchone()
lblcur.close()
labelSample=LabelSample()
labelSample.id=sample[0]
labelSample.labelid = sample[1]
labelSample.gebd =np.frombuffer(sample[2],dtype=np.float32).reshape((1, -1))
labelSample.lebd1=np.frombuffer(sample[3],dtype=np.float32).reshape((1, -1))
#labelSample.lebd2=np.frombuffer(sample[3],dtype=np.float32).reshape((1, -1))
labelSample.samp_img= cv2.imdecode(np.frombuffer(sample[4],dtype=np.uint8),cv2.IMREAD_COLOR)
labelSample.ghis =np.frombuffer(sample[5],dtype=np.single).reshape(-1)
labelSample.lhis=np.frombuffer(sample[6],dtype=np.single).reshape(-1)
labelSample.samp_entropy = sample[7]
if close_connect:
self.close()
return labelSample
def getLabelSampleByLayerAndFeatid(self,layername,featid,close_connect=True):
if not self.connect(): return None
lblcur = self.connection.cursor()
sql="select fid,labelid,gebd,lebd1,samp_img,ghis,lhis,samp_entropy,longitude,latitude from samples where src_layer='{}' and src_featid={}".format(layername,featid)
lblcur.execute(sql)
sample = lblcur.fetchone()
lblcur.close()
if sample:
labelSample=LabelSample()
labelSample.id=sample[0]
labelSample.labelid = sample[1]
labelSample.gebd =np.frombuffer(sample[2],dtype=np.float32).reshape((1, -1))
labelSample.lebd1=np.frombuffer(sample[3],dtype=np.float32).reshape((1, -1))
#labelSample.lebd2=np.frombuffer(sample[3],dtype=np.float32).reshape((1, -1))
labelSample.samp_img= cv2.imdecode(np.frombuffer(sample[4],dtype=np.uint8),cv2.IMREAD_COLOR)
labelSample.ghis =np.frombuffer(sample[5],dtype=np.single).reshape(-1)
labelSample.lhis=np.frombuffer(sample[6],dtype=np.single).reshape(-1)
labelSample.samp_entropy = sample[7]
labelSample.longitude = sample[8]
labelSample.latitude = sample[9]
if close_connect:
self.close()
return labelSample
else:
return None
def getOddSamplesOfLabel(self,layername,sampleid,labelid,sim_threshhold,close_connect=True):
'''
获取与指定sample同一label但相似性低于sim_threshhold的其他从layername中获取的samples
'''
if not self.connect(): return
lblcur = self.connection.cursor()
#sql="select withsampid,llsim, from similarity where fromsampid={}".format(sampleid)
sql="select withsampid as sampid,labelid,llsim,longitude,latitude from similarity,samples\
where fromsampid={} and llsim<{} and samples.fid=similarity.withsampid and samples.src_layer='{}'\
and samples.labelid={} order by llsim desc".format(sampleid,sim_threshhold,layername,labelid)
print_log(sql)
lblcur.execute(sql)
odd_samples = lblcur.fetchall()
if not odd_samples:
self.update_sim_with_sameLabeled(sampleid,overwrite=True,close_connect=False)
lblcur.execute(sql)
odd_samples = lblcur.fetchall()
lblcur.close()
if close_connect:
self.close()
return odd_samples
def getSimSamplesOfOtherLabel(self,layername,sampleid,labelid,sim_threshhold,close_connect=True):
'''
获取与指定sample相似(相似性高于sim_threshhold)但属于不同label的其他从layername中获取的samples
'''
if not self.connect(): return
lblcur = self.connection.cursor()
#sql="select withsampid,llsim, from similarity where fromsampid={}".format(sampleid)
sql="select withsampid as sampid,labelid,llsim,longitude,latitude from similarity,samples\
where fromsampid={} and llsim>={} and samples.fid=similarity.withsampid and samples.src_layer='{}'\
and samples.labelid<>{} order by llsim desc".format(sampleid,sim_threshhold,layername,labelid)
print_log(sql)
lblcur.execute(sql)
odd_samples = lblcur.fetchall()
lblcur.close()
if close_connect:
self.close()
return odd_samples
def getLabelSamplesByLabelTitle(self,title,limit=5,orderby=None,close_connect=True):
labelItem=self.getLabelItemByTitle(title,close_connect)
if labelItem:
return self.getLabelSamplesByLabelId(labelItem.id,limit,orderby,close_connect)
else:
return None
def delLabelSamplesOfFeature(self,layername,featid,close_connect=True):
'''
删除图层中指定feature对应的sample
'''
if not self.connect(): return None
try:
lblcur = self.connection.cursor()
sql="delete from samples where src_layer='{}' and src_featid={}".format(layername,featid)
#print_log(sql)
lblcur.execute(sql)
self.connection.commit()
lblcur.close()
except sqlite3.Error as error:
print_log("Failed to delete item: ", error)
finally:
if close_connect:
self.close()
def insertLabelItem(self,labelItem,close_connect=True):
if not self.connect(): return
try:
lblcur = self.connection.cursor()
lblcur.execute("insert into labels(title,superid,cc,description,reshape_rule) values(?,?,?,?,?)",
(labelItem.title,labelItem.superid,labelItem.cc,labelItem.desc,','.join(labelItem.reshape_rule)))
self.connection.commit()
id=lblcur.lastrowid
lblcur.close()
#print_log("The id of the inserted row :", id)
return id
except sqlite3.Error as error:
print_log("Failed to insert item into table", error)
return -1
finally:
if close_connect:
self.close()
self.getLabelItems(close_connect)
def updateLabelItem(self,labelItem,close_connect=True):
if not self.connect(): return
try:
lblcur = self.connection.cursor()
sql="update labels set title='{}',superid={},cc='{}',description='{}',reshape_rule='{}' where fid={}".format(
labelItem.title,labelItem.superid,labelItem.cc,labelItem.desc,",".join(labelItem.reshape_rule),labelItem.id)
#print_log(sql)
lblcur.execute("update labels set title='{}',superid={},cc='{}',description='{}',reshape_rule='{}' where fid={}".format(
labelItem.title,labelItem.superid,labelItem.cc,labelItem.desc,",".join(labelItem.reshape_rule),labelItem.id))
self.connection.commit()
lblcur.close()
#print_log("Total number of rows updated :", self.connection.total_changes)
except sqlite3.Error as error:
print_log("Failed to update item into table", error)
finally:
if close_connect:
self.close()
self.getLabelItems(close_connect)
def delteLabelItem(self,labelid,close_connect=True):
if not self.connect(): return
try:
lblcur = self.connection.cursor()
lblcur.execute("delete from labels where fid={}".format(labelid))
self.connection.commit()
lblcur.close()
#print_log("Total number of rows deleted :", self.connection.total_changes)
except sqlite3.Error as error:
print_log("Failed to delete item: ", error)
finally:
if close_connect:
self.close()
self.getLabelItems(close_connect)
def getMostUsedLabelItems(self,min_used=1,close_connect=True):
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select title,fid,superid,cc,description,reshape_rule from labels where used_times>={} order by used_times DESC".format(min_used))
labels = lblcur.fetchall()
lblcur.close()
mostUsedItems=[]
for label in labels:
rules=label[5].split(",") if label[5] else []
mostUsedItems.append(LabelItem(label[0], label[1], label[2], label[3],label[4], rules))
if close_connect:
self.close()
return mostUsedItems
def getLabelItemsByIds(self,ids=[],close_connect=True):
if not self.connect(): return
self.labelItems.clear()
if len(ids)>0:
lblcur = self.connection.cursor()
lblcur.execute("select title,fid,superid,cc,description,reshape_rule from labels where fid in ({})".format(",".join(ids)))
labels = lblcur.fetchall()
lblcur.close()
for label in labels:
rules=label[5].split(",") if label[5] else []
self.labelItems.append(LabelItem(label[0], label[1], label[2], label[3],label[4], rules))
if close_connect:
self.close()
def toBlob(self,labelSample):
'''
将numpy数组转换为sqlite的blob
'''
import copy
blobLS=copy.deepcopy(labelSample)
if hasattr(labelSample,"gebd") and labelSample.gebd is not None:
blobLS.gebd=sqlite3.Binary(labelSample.gebd.tobytes())
if hasattr(labelSample,"lebd1") and labelSample.lebd1 is not None:
blobLS.lebd1=sqlite3.Binary(labelSample.lebd1.tobytes())
if hasattr(labelSample,"lebd2") and labelSample.lebd2 is not None:
blobLS.lebd2=sqlite3.Binary(labelSample.lebd2.tobytes())
if hasattr(labelSample,"samp_img") and labelSample.samp_img is not None:
blobLS.samp_img=sqlite3.Binary(cv2.imencode(".jpg", labelSample.samp_img)[1].tobytes())
if hasattr(labelSample,"ghis") and labelSample.ghis is not None:
#print_log("Before insert Ghis:",labelSample.ghis.shape,labelSample.ghis.dtype)
blobLS.ghis=sqlite3.Binary(labelSample.ghis.tobytes())
if hasattr(labelSample,"lhis") and labelSample.lhis is not None:
#print_log("Before insert Lhis:",labelSample.lhis.shape,labelSample.lhis.dtype)
blobLS.lhis=sqlite3.Binary(labelSample.lhis.tobytes())
return blobLS
def insertLabelSample(self,labelSample,close_connect=True):
tosave=self.toBlob(labelSample)
if not self.connect(): return
try:
lblcur = self.connection.cursor()
attrs=[attr for attr in dir(tosave) if "__" not in attr and "set_" not in attr]
sql="insert into samples ( "
vstr="values("
i=0
values=[]
for attr in attrs:
if attr=='fid':
continue
v=getattr(tosave,attr)
if v is not None:
if i>0:
sql +=","
vstr+=","
sql+= attr
vstr+="?"
values.append(v)
i=i+1
vstr+=")"
sql = sql+ ")" +vstr
values=tuple(values)
lblcur.execute(sql,values)
self.connection.commit()
id = lblcur.lastrowid
#print_log("insert a sample into db for label: ", labelSample.labelid)
return id
except sqlite3.Error as error:
print_log("Failed to insert item into table,", error)
finally:
lblcur.close()
if close_connect:
self.close()
def updatelabelSampleFeatid(self,sampid,featid,close_connect=True):
if not self.connect(): return
try:
lblcur = self.connection.cursor()
lblcur.execute("update samples set src_featid={} where fid={}".format(featid,sampid))
self.connection.commit()
#print_log("Total number of rows updated :", self.connection.total_changes)
except sqlite3.Error as error:
print_log("Failed to update src_featid", error)
finally:
lblcur.close()
if close_connect:
self.close()
def updateLabelSampleLabelid(self,featid,labelid,close_connect=True):
if not self.connect(): return
try:
lblcur = self.connection.cursor()
lblcur.execute("update samples set labelid={} where src_featid={}".format(labelid,featid))
self.connection.commit()
#print_log("Total number of rows updated :", self.connection.total_changes)
except sqlite3.Error as error:
print_log("Failed to update labelid", error)
finally:
lblcur.close()
if close_connect:
self.close()
def updateLabelSample(self,labelSample,fields2update,close_connect=True):
assert labelSample.fid>0, "Please specify the fid of the sample you want update"
tosave=self.toBlob(labelSample)
if not self.connect(): return
try:
lblcur = self.connection.cursor()
attrs=[attr for attr in dir(tosave) if "__" not in attr and "set_" not in attr]
sql="update samples set "
i=0
values=[]
for attr in attrs:
if attr=='fid':
continue
v=getattr(tosave,attr)
if attr in fields2update and v is not None:
if i>0:
sql +=","
sql+= "{} = ? ".format(attr)
values.append(v)
i=i+1
sql += " where fid=? "
values.append(labelSample.fid)
values=tuple(values)
lblcur.execute(sql,values)
self.connection.commit()
#print_log("Total number of rows updated :", self.connection.total_changes)
except sqlite3.Error as error:
print_log("Failed to update item into table", error)
finally:
lblcur.close()
if close_connect:
self.close()
def delteLabelSample(self,sampleid,close_connect=True):
if not self.connect(): return
try:
lblcur = self.connection.cursor()
lblcur.execute("delete from samples where fid={})".format(sampleid))
self.connection.commit()
#print_log("Total number of rows deleted :", self.connection.total_changes)
except sqlite3.Error as error:
print_log("Failed to delete item", error)
finally:
lblcur.close()
if close_connect:
self.close()
def refresh_pca_agents_of_alllabels(self,progress_bar,min_usedtimes=5,close_connect=True):
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select labelid from (select labelid,count(*) as nums from samples group by labelid) where nums>{}".format(min_usedtimes))
labels = lblcur.fetchall()
lblcur.close()
if progress_bar is not None:
self.set_progress_range(progress_bar,[0,len(labels)])
i=0
for labelid in labels:
self.refresh_pca_of_label(labelid[0],close_connect=False)
self.refresh_agents_of_label(labelid[0],close_connect=False)
self.update_progress(progress_bar, i)
QApplication.processEvents()
i=i+1
if close_connect:
self.close()
def refresh_pca_of_label(self,labelid,close_connect=True):
'''
提取labelid的embeding,如果超过5个,就计算所有该labell所有embedding的主成分,保存到ebdpca表中
供判断相似性时使用
PCAEngine:用于计算PCA的对象
'''
def PCA(data,n_components=1):
'''
data为一个R行C列的二维nparray,表示R个C维向量
目的是生成较少数量(1个或最多3个)C维向量,用其其代表R个向量,这里的主要目标不是降低向量维度,而是生成能够代表R个样本的少数样本
return: 如果n_components为大于等1的整数,就返回指定个数的主成分,如果为0-1的小数,就返回累计得分超过该值的所有主成分,否则返回所有
'''
data=np.squeeze(data)
samp_mean=np.mean(data, axis=0) #各样本的C维平均值向量
A = data.T #转置,将R视为要降低的维数
MEAN = np.mean(A, axis=0) # 沿轴0调用mean函数
# 去中心化
X = np.subtract(A, MEAN)
# 计算协方差矩阵
COV = np.dot(X.T, X)
# 计算特征值和特征向量
W, V = np.linalg.eig(COV)
# 计算主成分贡献率以及累计贡献率
sum_lambda = np.sum(W) # 特征值的和
ratios = np.divide(W, sum_lambda) # 每个特征值的贡献率(特征值 / 总和)
components=[]
for i in range(len(ratios)):#降维后的结果(恢复均值水平)
components.append(np.dot(A,V.T[i]))
if n_components>=1 and type(n_components)==int:
return components[:n_components],ratios[:n_components]
elif n_components>0 and n_components<1:
index=0
sum_ratio=0
for ratio in ratios:
index+=1
sum_ratio+=ratio
if sum_ratio>=n_components:
break
return components[:index],ratios[:index]
else:
return components,ratios
labelSamples=self.getLabelSamplesByLabelId(labelid,limit=-1,close_connect=close_connect)
gebds=[]
lebds=[]
for samp in labelSamples:
gebds.append(samp.gebd)
lebds.append(samp.lebd1)
gebds=np.array(gebds)
lebds=np.array(lebds)
g_pca,g_score = PCA(gebds, 1)
l_pca,l_score = PCA(lebds, 1)
#print_log("Label {}: The scores for the PCA of {} gembeddings:{}".format(labelid,len(labelSamples),g_score[0]))
#print_log("Label {}: The scores for the PCA of lembedding:{}".format(labelid,l_score[0]))
self.updateOrInsertLabelPCA(labelid,np.array(g_pca[0]),np.array(l_pca[0]),int(g_score[0]*100),int(l_score[0]*100),close_connect=close_connect)
def get_cluster_cores(self,vectors):
'''
对样本进行聚类,并提取各个聚簇中的核心样本作为该类的代表。
OPTICS聚类方法的结果可能可以通过判断每个聚簇中的reachability最小的样本作为代表样本,但比较麻烦,是否正确有待测试
DBSCAN聚类方法返回core_sample_indices_,包含聚簇中包含的所有样本的索引,只能从中选取,但选取标准无法确定,因此选取结果的典型性无从判定
HDBSACN通过指定store_centers="medoid"参数,可以在medoids_属性中返回代表每一个聚簇的一个典型样本,该样本为聚簇中与其他样本之间距离最小的样本,其典型性较好
因此此处采用HDBSACN算法
'''
from sklearn.cluster import HDBSCAN
X = np.squeeze(vectors)
#必须指定allow_single_cluster=True,否则如果聚类结果只有一个聚簇,结果就会返回没有聚簇。
min_cluster_size=5 if X.shape[0]>=5 else X.shape[0]
hdb = HDBSCAN(min_cluster_size=min_cluster_size,n_jobs=-1,store_centers="medoid",allow_single_cluster=True).fit(X)
#返回向量的个数为聚类后的聚簇数
return hdb.medoids_
def refresh_agents_of_label(self,labelid,close_connect=True):
'''
todo:to be finished,采用聚类簇的中心比基于centroid距离排序法更好。
提取labelid的embeding,如果超过5个,就通过聚类计算该labell所有embedding的最具代表性的3个样本,保存到ebdpca表中的agents列中
供判断相似性时使用
'''
def get_agents(data, n_components=3):
'''
data为一个R行C列的二维nparray,表示R个C维向量
目的是找出其中法人3个C维向量,用其其代表全部R个向量
return: 如果n_components为大于等1的整数,就返回指定个数的代表向量
'''
data = np.squeeze(data)
samp_mean = np.mean(data, axis=0) # 各样本的C维平均值向量
distances=np.linalg.norm(data-samp_mean,axis=1)
sorted_indeices=np.argsort(distances)
result=data[sorted_indeices[:n_components]]
return result
labelSamples=self.getLabelSamplesByLabelId(labelid,limit=-1,close_connect=False)
if len(labelSamples)>3: #如果样本不多于5个,没必要获取代表向量
gebds=[]
lebds=[]
for samp in labelSamples:
gebds.append(samp.gebd)
lebds.append(samp.lebd1)
gebds=np.array(gebds)
lebds=np.array(lebds)
g_agents = self.get_cluster_cores(gebds)
l_agents = self.get_cluster_cores(lebds)
self.updateOrInsertLabelAgents(labelid,np.array(g_agents),np.array(l_agents),close_connect=False)
for samp in labelSamples:
self.update_sim_with_agents(g_agents,l_agents,samp.id,close_connect=close_connect)
def getLabelPCA(self,labelid,close_connect=True):
'''
返回存储在数据库中的label的主成分矢量
'''
if not self.connect(): return
lblcur = self.connection.cursor()
sql="select gpca,lpca from ebdpca where labelid={}".format(labelid)
#print_log(sql)
lblcur.execute(sql)
pcas = lblcur.fetchone()
lblcur.close()
if pcas:
gpca=np.frombuffer(pcas[0],dtype=np.float32).reshape((1, -1))
lpca=np.frombuffer(pcas[1],dtype=np.float32).reshape((1, -1))
else:
gpca = None
lpca = None
if close_connect:
self.close()
return gpca,lpca
def updateOrInsertLabelPCA(self,labelid,gpca,lpca,g_score,l_score,close_connect=True):
'''
存储主成分矢量
'''
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select count(*) from ebdpca where labelid={}".format(labelid))
result = lblcur.fetchone()
if result[0]>0:
sql="update ebdpca set gpca=?,lpca=?,gscore=?,lscore=? where labelid={}".format(labelid)
lblcur.execute(sql, (sqlite3.Binary(gpca.tobytes()), sqlite3.Binary(lpca.tobytes()),g_score,l_score))
else:
sql="insert into ebdpca(labelid,gpca,lpca,gscore,lscore) values(?,?,?,?,?)"
lblcur.execute(sql, (labelid,sqlite3.Binary(gpca.tobytes()), sqlite3.Binary(lpca.tobytes()),g_score,l_score))
self.connection.commit()
id = lblcur.lastrowid
#print_log("updateOrInsertLabelPCA, Inserted id:",id)
lblcur.close()
if close_connect:
self.close()
def updateOrInsertLabelAgents(self,labelid,g_agents,l_agents,close_connect=True):
'''
保存label的若干代表性样本(1-N,N为通过HDBSCAN算法聚类结果的聚簇数)
'''
if not self.connect(): return
lblcur = self.connection.cursor()
#print_log("In LabelBase.updateOrInsertLabelAgents:",labelid,g_agents.shape,l_agents.shape)
lblcur.execute("select count(*) from ebdpca where labelid={}".format(labelid))
result = lblcur.fetchone()
if result[0]>0:
sql="update ebdpca set gagents=?,lagents=?, gagent_count=?,lagent_count=? where labelid={}".format(labelid)
lblcur.execute(sql, (sqlite3.Binary(g_agents.tobytes()), sqlite3.Binary(l_agents.tobytes()),g_agents.shape[0],l_agents.shape[0]))
else:
sql="insert into ebdpca(labelid,gagents,lagents,gagent_count,lagent_count) values(?,?,?,?,?)"
lblcur.execute(sql, (labelid,sqlite3.Binary(g_agents.tobytes()), sqlite3.Binary(l_agents.tobytes()),g_agents.shape[0],l_agents.shape[0]))
self.connection.commit()
id = lblcur.lastrowid
print_log("updateOrInsertLabelAgents, Inserted id:",id)
lblcur.close()
if close_connect:
self.close()
def update_sim_with_pca(self,labelid,sampleid,close_connect=True):
'''
更新sampleid对应的sample与其label主成分PCA的相似度
'''
def cos_sim(vector_a, vector_b):
"""
计算两个向量之间的余弦相似度
:param vector_a: 向量 a
:param vector_b: 向量 b
:return: sim
"""
vector_a = np.mat(vector_a)
vector_b = np.mat(vector_b)
num = float(vector_a * vector_b.T)
denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
sim = num / denom
return sim
label_sample = self.getLabelSampleById(sampleid,close_connect=close_connect)
gpca,lpcal=self.getLabelPCA(labelid,close_connect=close_connect)
gsim_pca=cos_sim(gpca,label_sample.gebd) if gpca else 0
lsim_pca=cos_sim(lpcal,label_sample.lebd1) if lpca else 0
if gsim_pca>0 or lsim_pca>0:
if not self.connect(): return
try:
lblcur = self.connection.cursor()
lblcur.execute("update samples set gsim_pca=?, lsim_pca=? where sampleid=?",
(gsim_pca,lsim_pca,sampleid))
self.connection.commit()
#print_log("Total number of rows updated :", self.connection.total_changes)
except sqlite3.Error as error:
print_log("Failed to update item into table", error)
finally:
lblcur.close()
if close_connect:
self.close()
def getLabelsAgentsPcaByIds(self,labelids,close_connect=True):
'''
返回存储在数据库中的label的代表性样本和主成分结果
为{labelid=[gagents,lagents,gpca,lpca]}组成的Dict
'''
if not self.connect(): return
lblcur = self.connection.cursor()
ids_str=",".join([str(id) for id in labelids])
sql="select labelid,gagents,lagents,gpca,lpca,gagent_count,lagent_count from ebdpca where gagents is not Null and lagents is not Null and labelid in ({})".format(ids_str)
#print_log("In getLabelsAgentsPcaByIds:",sql)
lblcur.execute(sql)
results = lblcur.fetchall()
lblcur.close()
agents={}
if results:
for agent in results:
#print_log("In labelBase.getLabelAgentsById:",agent[0],agent[5],agent[6])
gagents=np.frombuffer(agent[1],dtype=np.float64).reshape((agent[5], -1))
lagents=np.frombuffer(agent[2],dtype=np.float64).reshape((agent[6], -1))
gpca = np.frombuffer(agent[3], dtype=np.float32).reshape((1, -1))
lpca = np.frombuffer(agent[4], dtype=np.float32).reshape((1, -1))
#print_log("In labelBase.getLabelAgentsById:",gagents.shape,lagents.shape)
#print_log("In getLabelsAgentsPcaByIds:",agent[0],type(agent[0]))
agents[agent[0]]=[gagents,lagents,gpca,lpca]
if close_connect:
self.close()
return agents
def getLabelAgentsById(self, labelid, close_connect=True):
'''
返回存储在数据库中的label的主成分矢量
'''
if not self.connect(): return
lblcur = self.connection.cursor()
sql="select gagents,lagents,gagent_count,lagent_count from ebdpca where gagents is not Null and lagents is not Null and labelid={}".format(labelid)
#print_log(sql)
lblcur.execute(sql)
agents = lblcur.fetchone()
lblcur.close()
if agents:
#print_log("In labelBase.getLabelAgentsById:",labelid,agents[2],agents[3])
gagents=np.frombuffer(agents[0],dtype=np.float64).reshape((agents[2], -1))
lagents=np.frombuffer(agents[1],dtype=np.float64).reshape((agents[3], -1))
#print_log("In labelBase.getLabelAgentsById:",gagents.shape,lagents.shape)
#print_log(gagents)
else:
gagents = None
lagents = None
if close_connect:
self.close()
return gagents,lagents
def update_sim_with_agents(self,gagents,lagents,sampleid,close_connect=True):
'''
更新sampleid对应的sample与其label各个代表性样本的相似度的最大值
'''
def cos_sim(vector_a, vector_b):
"""
计算两个向量之间的余弦相似度
:param vector_a: 向量 a
:param vector_b: 向量 b
:return: sim
"""
vector_a = np.mat(vector_a)
vector_b = np.mat(vector_b)
num = float(vector_a * vector_b.T)
denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
sim = num / denom
return sim
label_sample = self.getLabelSampleById(sampleid,close_connect=close_connect)
max_gsim = 0
max_lsim = 0
if gagents is not None and lagents is not None:
for gagent in gagents:
gsim=cos_sim(gagent,label_sample.gebd) if gagents is not None else 0
if gsim>max_gsim:
max_gsim=gsim
for lagent in lagents:
lsim=cos_sim(lagent,label_sample.lebd1) if lagents is not None else 0
if lsim>max_lsim:
max_lsim=lsim
if max_gsim>0 or max_lsim>0:
if not self.connect(): return
try:
lblcur = self.connection.cursor()
lblcur.execute("update samples set gsim_agents=?, lsim_agents=? where fid=?",
(max_gsim,max_lsim,sampleid))
self.connection.commit()
#print_log("Total number of rows updated :", self.connection.total_changes)
except sqlite3.Error as error:
print_log("Failed to update item into table", error)
finally:
lblcur.close()
if close_connect:
self.close()
def update_or_insert_siminfo_between(self,fromsampid,withsampid,overwrite=True,close_connect=True,skipExisting=False):
'''
更新或插入两个sample之间的相似性计算结果
'''
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute(
"select count(*) from similarity where fromsampid={} and withsampid={}".format(fromsampid, withsampid))
result = lblcur.fetchone()
if result[0] > 0 and skipExisting:
pass
else:
fromsamp=self.getLabelSampleById(fromsampid,close_connect=False)
withsamp=self.getLabelSampleById(withsampid,close_connect=False)
ggsim=cosine_similarity(fromsamp.gebd, withsamp.gebd)
glsim=cosine_similarity(fromsamp.gebd, withsamp.lebd1)
llsim=cosine_similarity(fromsamp.lebd1, withsamp.lebd1)
lgsim=cosine_similarity(fromsamp.lebd1, withsamp.gebd)
ggcorrcoef=correlation_coefficient(fromsamp.gebd, withsamp.gebd)
glcorrcoef=correlation_coefficient(fromsamp.gebd, withsamp.lebd1)
llcorrcoef=correlation_coefficient(fromsamp.lebd1, withsamp.lebd1)
lgcorrcoef=correlation_coefficient(fromsamp.lebd1, withsamp.gebd)
mutualinfo=mutual_info(fromsamp.samp_img, withsamp.samp_img)
gghissim=cosine_similarity(fromsamp.ghis, withsamp.ghis)
glhissim=cosine_similarity(fromsamp.ghis, withsamp.lhis)
llhissim=cosine_similarity(fromsamp.lhis, withsamp.lhis)
lghissim=cosine_similarity(fromsamp.lhis, withsamp.ghis)
maxsim=max([ggsim,glsim,llsim,lgsim,ggcorrcoef,glcorrcoef,llcorrcoef,lgcorrcoef,gghissim,glhissim,llhissim,lghissim])
#if all similarities are all less than 50 for the different label, then do not store in database to save space
if (fromsamp.labelid!=withsamp.labelid and maxsim>50) or fromsamp.labelid==withsamp.labelid:
if result[0]>0 and overwrite:
sql="update similarity set ggsim=?,glsim=?, llsim=?,lgsim=?,ggcorrcoef=?,glcorrcoef=?,llcorrcoef=?,lgcorrcoef=?," \
"mutualinfo=?,gghissim=?,glhissim=?,llhissim=?,lghissim=? where fromsampid={} and withsampid={}".format(fromsampid,withsampid)
lblcur.execute(sql, (ggsim,glsim,llsim,lgsim,ggcorrcoef,glcorrcoef,llcorrcoef,lgcorrcoef,
mutualinfo,gghissim,glhissim,llhissim,lghissim
)
)
else:
sql="insert into similarity(fromsampid,withsampid,ggsim,glsim,llsim,lgsim,ggcorrcoef,glcorrcoef,llcorrcoef,lgcorrcoef," \
"mutualinfo,gghissim,glhissim,llhissim,lghissim) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
#print_log("********************************************\n", fromsamp.ghis.shape,withsamp.ghis.shape,fromsamp.ghis, withsamp.ghis)
#print_log("********************************************\n", fromsamp.lhis.shape,withsamp.lhis.shape,fromsamp.lhis, withsamp.lhis)
lblcur.execute(sql, (fromsampid,withsampid,
ggsim, glsim, llsim, lgsim, ggcorrcoef, glcorrcoef, llcorrcoef, lgcorrcoef,
mutualinfo, gghissim, glhissim, llhissim, lghissim
)
)
self.connection.commit()
id = lblcur.lastrowid
#print_log("Inserted id:",id)
lblcur.close()
if close_connect:
self.close()
def update_sim_with_eachother(self,overwrite=True,progress_bar=None,close_connect=True,skipExisting=False):
'''
更新sample两两之间的相似性计算结果
overwrite:如果已经存在,是否覆盖更新
'''
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select fid,labelid from samples order by fid DESC")
result = lblcur.fetchall()
lblcur.close()
if progress_bar is not None:
self.set_progress_range(progress_bar,[0,len(result)**2/2])
i=0
for fromid in result:
for withid in result:
i = i + 1
if progress_bar is not None:
self.update_progress(progress_bar, i)
QApplication.processEvents()
if fromid[0]<=withid[0]:
continue
self.update_or_insert_siminfo_between(fromid[0],withid[0],overwrite=overwrite,close_connect=False,skipExisting=skipExisting)
if self.interruptLongTransaction:
break
if self.interruptLongTransaction:
self.update_progress(progress_bar, len(result)**2/2)
break
if close_connect:
self.close()
self.setInterruptLongTransaction(False)
def update_sim_with_sameLabeled(self,this_sampleid,overwrite=True,progress_bar=None,close_connect=True):
'''
更新指定sample与同一类其他sample两两之间的相似性计算结果
overwrite:如果已经存在,是否覆盖更新
'''
if not self.connect(): return
lblcur = self.connection.cursor()
lblcur.execute("select labelid from samples where fid={}".format(this_sampleid))
labelid = lblcur.fetchone()
if labelid:
lblcur.execute("select fid from samples where fid<>{} and labelid={}".format(this_sampleid,labelid[0]))
result = lblcur.fetchall()
lblcur.close()
if progress_bar is not None:
self.set_progress_range(progress_bar,[0,len(result)**2])
i=0
for withid in result:
i = i + 1
if progress_bar is not None:
self.update_progress(progress_bar, i)
self.update_or_insert_siminfo_between(this_sampleid,withid[0],overwrite=overwrite,close_connect=False)
if close_connect:
self.close()
def update_progress(self,progress_bar,value):
progress_bar.setValue(value)
def set_progress_range(self,progress_bar,range):
progress_bar.setRange(range[0],range[1])
def setInterruptLongTransaction(self,state=True):
self.interruptLongTransaction=state
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。