diff --git a/feature_extract/extract_rpm.py b/feature_extract/extract_rpm.py index 3bcabed64116cfc4650e21744d958245c5cef728..c8648b2fe7cdb1ba7bfb3433fe0c087b88022ef9 100644 --- a/feature_extract/extract_rpm.py +++ b/feature_extract/extract_rpm.py @@ -2,6 +2,7 @@ import argparse import os import xml.etree.ElementTree as ET import json +import re def strip_namespace(tag_name): @@ -26,6 +27,7 @@ def process_xml_file(file_path): elif strip_namespace(thirdLevel.tag) == 'version': data[count][strip_namespace(thirdLevel.tag)] = thirdLevel.get('ver').strip() if thirdLevel.get( 'ver') else '' + # 是否可以考虑取消summary、description,不同系统的差异性较大 elif strip_namespace(thirdLevel.tag) == 'summary': data[count][strip_namespace(thirdLevel.tag)] = thirdLevel.text.strip() if thirdLevel.text else '' elif strip_namespace(thirdLevel.tag) == 'description': @@ -49,15 +51,20 @@ def process_xml_file(file_path): if 'requires' in data[count]: requires = data[count]['requires'] new_requires = [] - for r in requires: - if r.startswith('/'): + for require_item in requires: + if require_item.startswith('/'): continue - if r.find('>=') != -1: - new_requires.append(r.split('>=')[0].strip()) - elif r.find('>') != -1: - new_requires.append(r.split('>')[0].strip()) - else: - new_requires.append(r.strip()) + if require_item.find('>=') != -1: + require_item = require_item.split('>=')[0] + elif require_item.find('>') != -1: + require_item = require_item.split('>')[0] + require_item = require_item.strip() + # 统一命名规范 + require_item = re.sub(r'-devel$', '-dev', require_item) + require_item = re.sub(r'-help$', '-doc', require_item) + require_item = require_item.strip() + if require_item not in new_requires: + new_requires.append(require_item) data[count]['requires'] = new_requires else: data[count]['requires'] = [] diff --git a/feature_extract/extract_spec.py b/feature_extract/extract_spec.py index 56cc07e90fd5445b79d030a874903d26f843c8de..445cc9d5d307bc14effa8072fac79de2669f2ddf 100644 --- 
a/feature_extract/extract_spec.py +++ b/feature_extract/extract_spec.py @@ -2,6 +2,8 @@ import argparse import os import subprocess import json +import re +import shutil from tqdm import tqdm if __name__ == '__main__': @@ -50,34 +52,60 @@ if __name__ == '__main__': output_binary = subprocess.run(['rpmspec', '-q', os.path.join(subfolder, file)], cwd=subfolder, stdout=subprocess.PIPE, stderr=subprocess.PIPE) binary_list = output_binary.stdout.decode().strip().splitlines() - data[count]['binaryList'] = binary_list + # 去掉版本信息 + binary_list_new = [] + for item in binary_list: + binary_tmp = re.sub(r'-[0-9].*', '', item) + if binary_tmp not in binary_list_new: + binary_list_new.append(binary_tmp) + binary_list_new.sort() + data[count]['binaryList'] = binary_list_new output_providers = subprocess.run(['rpmspec', '-q', '--provides', os.path.join(subfolder, file)], cwd=subfolder, stdout=subprocess.PIPE, stderr=subprocess.PIPE) provides = output_providers.stdout.decode().strip().splitlines() - new_provides = [] - for p in provides: - if p.find('=') != -1: - new_provides.append(p.split('=')[0].strip()) - else: - new_provides.append(p.strip()) - data[count]['provides'] = new_provides + provides_new = [] + for item in provides: + if item.find('=') != -1: + item = (item.split('=')[0].strip()) + + # 去掉冗余信息 + item = re.sub(r'\(aarch-64\)', '', item).strip() + item = re.sub(r'-debuginfo$', '', item).strip() + item = re.sub(r'-debugsource$', '', item).strip() + if item not in provides_new: + provides_new.append(item) + provides_new.sort() + data[count]['provides'] = provides_new output_buildrequires = subprocess.run( ['rpmspec', '-q', '--buildrequires', os.path.join(subfolder, file)], cwd=subfolder, stdout=subprocess.PIPE, stderr=subprocess.PIPE) build_requires = output_buildrequires.stdout.decode().strip().splitlines() - new_build_requires = [] - for br in build_requires: - if br.startswith('/'): + build_requires_new = [] + ignore_ends=['-dev', '-doc', '-devel', '-help'] + for item 
in build_requires: + if item.startswith('/'): continue - if br.find('>=') != -1: - new_build_requires.append(br.split('>=')[0].strip()) - elif br.find('>') != -1: - new_build_requires.append(br.split('>')[0].strip()) - else: - new_build_requires.append(br.strip()) - data[count]['buildRequires'] = new_build_requires + if item.find('>=') != -1: + item = item.split('>=')[0].strip() + elif item.find('>') != -1: + item = item.split('>')[0].strip() + + # 去掉-dev和-doc + ignore_flag = False + for ending in ignore_ends: + if item.endswith(ending): + ignore_flag = True + break + if ignore_flag: + continue + #item = re.sub(r'-devel$', '-dev', item) + #item = re.sub(r'-help$', '-doc', item) + if item not in build_requires_new: + build_requires_new.append(item.strip()) + build_requires_new.sort() + data[count]['buildRequires'] = build_requires_new try: source0_command = "rpmspec -P " + os.path.join(subfolder, file) + " | grep Source0:" @@ -91,6 +119,26 @@ if __name__ == '__main__': data[count]['source0'] = "" if file == 'src': + # 如果不存在源码,或者源码与spec文件同级,则将spec同级下的源码复制到src + src_path = os.path.join(subfolder, file) + file_count = 0 + for _, dirs, files in os.walk(src_path): + file_count += len(dirs) + file_count += len(files) + if 0 == file_count: + for root, dirs, files in os.walk(subfolder): + for file_name in files: + if file_name != 'filelist' \ + and not file_name.endswith('.patch') \ + and not file_name.endswith('.spec'): + shutil.copy2(os.path.join(root, file_name), src_path) + for dir_name in dirs: + if dir_name != 'src': + target_dir_path = os.path.join(src_path, dir_name) + os.makedirs(target_dir_path, exist_ok=True) + shutil.copytree(os.path.join(root, dir_name), target_dir_path, dirs_exist_ok=True) + break + command_str_macro = "grep -E -Irho '\<[A-Z]+_[A-Z]+\>' '" + os.path.join(subfolder, file) + "' | sort | uniq -c | sort -nr | head -10" macro_str = subprocess.getoutput(command_str_macro) @@ -129,8 +177,8 @@ if __name__ == '__main__': data[count]['email_names'] = [] 
# path_names: keep up to 10 quoted absolute paths found in the sources,
# preferring "deep" paths (three or more components) and topping the list up
# with shallower paths only when fewer than 10 deep ones exist.
#
# The previous top-up query reused the deep-path pattern, so it could only
# return paths that had already been collected and appended them a second time.
# The fallback now uses the any-depth pattern and skips duplicates. Taking the
# top 10 fallback hits is enough: at most len(path_names) of them can already
# be present, which leaves at least the number of entries still missing.
#
# NOTE(review): the directory name is spliced into a shell command between
# single quotes; a name containing a single quote would break the command.
max_path_names = 10
path_patterns = (
    '"/[A-Za-z.]+(/[A-Za-z.]+){2,}"',  # first pass: three or more components
    '"/[A-Za-z.]+(/[A-Za-z.]+)*"',     # fallback: any depth
)
path_names = []
for path_pattern in path_patterns:
    if len(path_names) >= max_path_names:
        break
    command_str_path = ("grep -E -Irho '" + path_pattern + "' '"
                        + os.path.join(subfolder, file)
                        + "' | sort | uniq -c | sort -nr | head -" + str(max_path_names))
    path_str = subprocess.getoutput(command_str_path)
    for path in path_str.split('\n'):
        if path.find("Permission denied") != -1:
            continue
        # Each `uniq -c` line looks like: <count> "<path>"
        fields = path.strip().split(' ')
        if len(fields) < 2:
            # Empty output or an unexpected line: nothing to record.
            continue
        path_name = fields[1].replace('\"', '')
        if path_name not in path_names and len(path_names) < max_path_names:
            path_names.append(path_name)
data[count]['path_names'] = path_names
#!/bin/bash
#
# Download every .src.rpm of one openEuler release, one sub-directory per
# source repository.

. ../lib/lib_rpm.sh

base_dir="/home/lxb/pkgmapping/openeuler/"
version="openEuler-24.03-LTS"
# Read by log_msg (lib_rpm.sh): when set, log lines are also appended here.
project_log_file="${base_dir}/${version}/log/download-log"

src_rpm_url=(
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/EPOL/main/source/repodata/repomd.xml"
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/EPOL/update/main/source/repodata/repomd.xml"
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/update/source/repodata/repomd.xml"
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/source/repodata/repomd.xml"
)

# Target directory for each URL above; the two arrays are index-aligned.
# The last two entries were previously swapped ("source", "update"), which
# stored the update repository under source/ and vice versa.
src_rpm_dir_name=(
    "EPOL-main"
    "EPOL-update"
    "update"
    "source"
)

# Download the source rpm packages.
log_msg "rmp包下载 begin ..."
work_path="${base_dir}/${version}/srpm"
if [ ! -d "${work_path}" ]; then
    mkdir -p "${work_path}"
    chmod 775 "${work_path}"
fi

# Both arrays must have the same length, otherwise a repository would be
# downloaded into the wrong (or an empty-named) directory.
if [ "${#src_rpm_url[@]}" -ne "${#src_rpm_dir_name[@]}" ]; then
    log_msg "src_rpm_url element nums[${#src_rpm_url[@]}] not equal src_rpm_dir_name element nums[${#src_rpm_dir_name[@]}]"
    exit 1
fi

for ((i = 0; i < ${#src_rpm_url[@]}; i++))
do
    work_path_tmp="${work_path}/${src_rpm_dir_name[i]}"
    if [ ! -d "${work_path_tmp}" ]; then
        mkdir -p "${work_path_tmp}"
        chmod 775 "${work_path_tmp}"
    fi
    # A bare `exit` would propagate cd's status; make the failure explicit.
    cd "${work_path_tmp}" || exit 1

    url="${src_rpm_url[i]}"

    if [ -z "${url}" ]; then
        continue
    fi

    download_rpm_by_repomdxml "${url}" "${work_path_tmp}"
done
#!/bin/bash
#
# Download repository metadata for one openEuler release:
#   * primary.xml of every source repository  -> ${base_dir}/${version}/rpmxml
#   * filelists.xml of the binary repositories -> ${base_dir}/${version}/binaryxml

. ../lib/lib_rpm.sh

base_dir="/home/lxb/pkgmapping/openeuler/"
version="openEuler-24.03-LTS"

src_repomd_xml_urls=(
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/EPOL/main/source/repodata/repomd.xml"
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/EPOL/update/main/source/repodata/repomd.xml"
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/update/source/repodata/repomd.xml"
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/source/repodata/repomd.xml"
)

binary_xml_urls=(
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/update/x86_64/repodata/repomd.xml"
    "https://mirrors.aliyun.com/openeuler/openEuler-24.03-LTS/update/aarch64/repodata/repomd.xml"
)


# Download primary.xml for every URL in src_repomd_xml_urls.
# NOTE: despite the "binary" in its name, this function iterates the
# *source* repository list; the name is kept so existing callers still work.
download_binary_primary_xml_repo()
{
    for url in "${src_repomd_xml_urls[@]}"
    do
        if [ -z "${url}" ]; then
            continue
        fi

        download_primary_xml_by_repomdxml "${url}" "${base_dir}/${version}/rpmxml"
    done
}


# Download and decompress filelists.xml.gz for every URL in binary_xml_urls.
download_binary_filelist_xml_repo()
{
    local binary_xml_dir="${base_dir}/${version}/binaryxml"

    # The existence test previously checked a misspelled path ("bianryxml"),
    # so it was always true and the directory was re-chmod'ed on every run.
    if [ ! -d "${binary_xml_dir}" ]; then
        mkdir -p "${binary_xml_dir}"
        chmod 775 "${binary_xml_dir}"
    fi

    cd "${binary_xml_dir}" || exit 1

    for url in "${binary_xml_urls[@]}"
    do
        if [ -z "${url}" ]; then
            continue
        fi

        # wget would otherwise save the new index as repomd.xml.1
        rm -f repomd.xml

        # Download the repository index.
        wget "${url}"
        if [ ! -f "./repomd.xml" ]; then
            log_msg "wget repomd.xml error[${url}] ..."
            continue
        fi

        # Extract the file name of the filelists archive from its href.
        xml_gz_file_name=$(grep "filelists.xml.gz" ./repomd.xml | awk -F'/' '{print $2}' | awk -F'"' '{print $1}')
        if [ -z "${xml_gz_file_name}" ]; then
            # Without a name the download URL below would point at the directory.
            log_msg "filelists.xml.gz not listed in repomd.xml [${url}] ..."
            continue
        fi

        # Download the filelists archive from the same repodata directory.
        url_xml_base=$(echo "${url}" | awk -F'/repomd.xml' '{print $1}')
        url_xml_file="${url_xml_base}/${xml_gz_file_name}"
        wget "${url_xml_file}"
        if [ ! -f "${xml_gz_file_name}" ]; then
            log_msg "wget xml file error [${url_xml_file}] ..."
            continue
        fi
    done

    rm -f repomd.xml

    # Decompress whatever was downloaded; skip the unexpanded glob when no
    # archive exists.
    for gz_file in ./*filelists.xml.gz
    do
        [ -f "${gz_file}" ] && gzip -d "${gz_file}"
    done
}

download_binary_primary_xml_repo
download_binary_filelist_xml_repo
def get_inf_ignore_version(src: str) -> str:
    """Return *src* with its trailing version suffix removed.

    The suffix is the left-most ``-<digit>`` that is followed only by
    non-``/`` characters up to the end of the string, so for a URL or path
    only the last component is shortened. Surrounding whitespace is stripped.
    """
    return re.sub(r'-[0-9][^/]*$', '', src).strip()


def _es_search_unique_field(query_type: str, src_index: str, src_param: str,
                            src_param_value: str, des_parm: str, page_size: int):
    """Run a single-clause query and return the distinct ``des_parm`` values.

    ``query_type`` is the query clause name placed under ``"query"``
    ("wildcard", "regexp" or "term"). Values are returned in hit order; a
    list is used for de-duplication so unhashable field values still work.
    Raises ``KeyError`` if a hit has no ``des_parm`` field, as before.
    """
    es_res = es.search(index=src_index, size=page_size, body={
        "query": {
            query_type: {
                src_param: src_param_value
            }
        }
    })

    res = []
    for item in es_res['hits']['hits']:
        tmp = item['_source'][des_parm]
        if tmp not in res:
            res.append(tmp)

    return res


def es_search_record_by_name_like(src_index: str, src_param: str, src_param_value: str, des_parm: str, page_size: int = 100):
    """Wildcard ("like") match on ``src_param``; return distinct ``des_parm`` values."""
    return _es_search_unique_field("wildcard", src_index, src_param, src_param_value, des_parm, page_size)


def es_search_record_by_name_regexp(src_index: str, src_param: str, src_param_value: str, des_parm: str, page_size: int = 100):
    """Regular-expression match on ``src_param``; return distinct ``des_parm`` values."""
    return _es_search_unique_field("regexp", src_index, src_param, src_param_value, des_parm, page_size)


def es_search_record_by_name_term(src_index: str, src_param: str, src_param_value: str, des_parm: str, page_size: int = 100):
    """Exact ("term") match on ``src_param``; return distinct ``des_parm`` values."""
    return _es_search_unique_field("term", src_index, src_param, src_param_value, des_parm, page_size)
#-------------------------------------------------------------------------------
# Description:
#   Print a log message.
#
# Notes:
#   When the project_log_file variable is set, the message is printed to the
#   console and appended to that file (its parent directory is created when
#   the file does not exist yet); otherwise it is printed to the console only.
#-------------------------------------------------------------------------------
log_msg()
{
    # No log file configured: console only.
    if [ -z "${project_log_file}" ]; then
        echo "$@"
        return
    fi

    # First write to this log file: make sure its directory exists.
    if [ ! -f "${project_log_file}" ]; then
        local log_dir
        log_dir=$(dirname "${project_log_file}")
        if [ ! -d "${log_dir}" ]; then
            mkdir -p "${log_dir}"
            chmod 775 "${log_dir}"
        fi
    fi

    echo "$@" | tee -a "${project_log_file}"
}
#----------------------------------------
# Description: (new version)
#   Unpack every *.src.rpm found under the source directory into the
#   destination directory. Each package gets its own directory (file name up
#   to the first "-<digit>"); the archives contained in the package are
#   extracted into its src/ sub-directory, choosing the tool by file suffix.
# Arguments:
#   $1: directory containing the source rpm packages
#   $2: directory to unpack into (must already exist)
#----------------------------------------
uncompress_src_rpm_by_type_new()
{
    [ $# -ne 2 ] && log_msg "[error] uncompress_src_rpm_by_type_new args num error: [$#]" && exit 1

    local src_rpm_dir_tmp="$1"
    local des_rpm_dir_tmp="$2"

    if [ ! -d "${src_rpm_dir_tmp}" ]; then
        log_msg "[error] src dir [${src_rpm_dir_tmp}] is not exist or not a dir ..."
        exit 1
    fi

    if [ ! -d "${des_rpm_dir_tmp}" ]; then
        log_msg "[error] des dir [${des_rpm_dir_tmp}] is not exist or not a dir ..."
        exit 1
    fi

    log_msg ""
    log_msg "[${src_rpm_dir_tmp}] begin ... "
    log_msg ""

    # The braces keep the counter and the loop in the same pipeline subshell,
    # so the count is still visible to the final log_msg.
    find "${src_rpm_dir_tmp}" -name "*.src.rpm" | {
        unzip_num=0;
        while read -r file; do
            ((unzip_num++))
            if [ -f "$file" ]; then
                filename=$(basename "$file")    # package file name
                # shellcheck disable=SC2001
                folder=$(echo "$filename" | sed 's/-[0-9].*$//')
                # A package that was unpacked before is not unpacked again.
                [ -d "${des_rpm_dir_tmp}/${folder}" ] && echo "${des_rpm_dir_tmp}/${folder} exist" && continue
                rpm2cpio "$file" | cpio -div -D "${des_rpm_dir_tmp}/${folder}"

                unzip_des_dir_tmp="${des_rpm_dir_tmp}/${folder}/src"    # extraction target
                file_list="${des_rpm_dir_tmp}/${folder}/filelist"       # file list

                # Create the src directory when it does not exist.
                if [ ! -d "${unzip_des_dir_tmp}" ] ; then
                    mkdir "${unzip_des_dir_tmp}"
                    chmod 775 "${unzip_des_dir_tmp}"
                fi

                # Create the file-list file when it does not exist.
                if [ ! -f "${file_list}" ] ; then
                    touch "${file_list}"
                    chmod 775 "${file_list}"
                fi

                # Directory holding the archives shipped in the package.
                tar_folder="${des_rpm_dir_tmp}/${folder}"
                chmod 775 "${tar_folder}"

                find "$tar_folder" \
                    -name "*.tar.*" \
                    -o -name "*.tar" \
                    -o -name "*.tgz" \
                    -o -name "*.zip" \
                    -o -name "*.xpi" \
                    -o -name "*.tbz" \
                    -o -name "*.txz" \
                    -o -name "*.7z" \
                    -o -name "*.oxt" | while read -r archive
                do
                    if [ -f "${archive}" ] ; then
                        # Extract the archive into the src directory.
                        # The glob patterns must stay unquoted: a quoted
                        # pattern such as "*.tar" only matches a file that is
                        # literally named '*.tar', which sent every such
                        # archive to the "Unsupported" branch.
                        case "${archive}" in
                            *.tar | *.txz | *.tar.zst | *.tar.lzma)
                                tar -xf "${archive}" -C "${unzip_des_dir_tmp}"
                                ;;
                            *.tar.gz)
                                tar -xzf "${archive}" -C "${unzip_des_dir_tmp}"
                                ;;
                            *.tar.bz2 | *.tbz)
                                tar -xjf "${archive}" -C "${unzip_des_dir_tmp}"
                                ;;
                            *.tar.xz)
                                tar -xJf "${archive}" -C "${unzip_des_dir_tmp}"
                                ;;
                            *.tar.zstd)
                                tar -I zstd -xf "${archive}" -C "${unzip_des_dir_tmp}"
                                ;;
                            *.tar.tgz | *.tgz)
                                tar -zxf "${archive}" -C "${unzip_des_dir_tmp}"
                                ;;
                            *.tar.lz)
                                tar -I lzip -xf "${archive}" -C "${unzip_des_dir_tmp}"
                                ;;
                            *.tar.Z)
                                tar -xZf "${archive}" -C "${unzip_des_dir_tmp}"
                                ;;
                            *.zip | *.xpi | *.oxt)
                                unzip "${archive}" -d "${unzip_des_dir_tmp}"
                                ;;
                            *.7z)
                                7za x "${archive}" -o"${unzip_des_dir_tmp}"
                                ;;
                            *)
                                # Typically a detached signature next to an
                                # archive (.sig, .sign, .minisig, .asc).
                                log_msg "Unsupported file format: [${archive}]"
                                # Do not exit: the package may contain other
                                # archives that can still be extracted.
                                ;;
                        esac
                    fi
                done

            fi
        done

        log_msg "procing nums[${unzip_num}]"
    }
}
#-------------------------------------------------------------------------------
# Description: (new version)
#   Unpack every *.src.rpm found under the source directory into the
#   destination directory, extracting the contained archives with
#   /usr/lib/rpm/rpmuncompress. Each package gets its own directory and its
#   sources are extracted into the src/ sub-directory of that directory.
# Arguments:
#   $1: directory containing the source rpm packages
#   $2: directory to unpack into (must already exist)
#-------------------------------------------------------------------------------
uncompress_src_rpm_by_rpmuncompress()
{
    [ $# -ne 2 ] && log_msg "[error] uncompress_src_rpm_by_rpmuncompress args num error: [$#]" && exit 1

    local src_rpm_dir_tmp="$1"
    local des_rpm_dir_tmp="$2"

    if [ ! -d "${src_rpm_dir_tmp}" ]; then
        log_msg "[error] src dir [${src_rpm_dir_tmp}] is not exist or not a dir ..."
        exit 1
    fi

    if [ ! -d "${des_rpm_dir_tmp}" ]; then
        log_msg "[error] des dir [${des_rpm_dir_tmp}] is not exist or not a dir ..."
        exit 1
    fi

    log_msg ""
    log_msg "[${src_rpm_dir_tmp}] begin ... "
    log_msg ""

    find "${src_rpm_dir_tmp}" -name "*.src.rpm" | {
        unzip_num=0;
        while read -r file; do
            ((unzip_num++))
            if [ -f "$file" ]; then
                filename=$(basename "$file")    # package file name
                # shellcheck disable=SC2001
                folder=$(echo "$filename" | sed 's/-[0-9].*$//')
                [ -d "${des_rpm_dir_tmp}/${folder}" ] && echo "${des_rpm_dir_tmp}/${folder} exist" && continue
                rpm2cpio "$file" | cpio -div -D "${des_rpm_dir_tmp}/${folder}"

                unzip_des_dir_tmp="${des_rpm_dir_tmp}/${folder}/src"          # extraction target
                local file_list="${des_rpm_dir_tmp}/${folder}/filelist"       # file list

                # Create the src directory when it does not exist.
                if [ ! -d "${unzip_des_dir_tmp}" ] ; then
                    mkdir "${unzip_des_dir_tmp}"
                    chmod 775 "${unzip_des_dir_tmp}"
                fi

                # Create the file-list file when it does not exist.
                if [ ! -f "${file_list}" ] ; then
                    touch "${file_list}"
                    chmod 775 "${file_list}"
                fi

                # Directory holding the archives shipped in the package.
                tar_folder="${des_rpm_dir_tmp}/${folder}"
                chmod 775 "${tar_folder}"

                find "$tar_folder" -regextype posix-extended -regex ".*\.(tar|tar.*|tgz|zip|xpi|tbz|txz|7z|oxt|rar|tbz2|whl)" | while read -r archive
                do
                    if [ -f "${archive}" ] ; then
                        # rpmuncompress -x extracts into the current directory.
                        # Previously the archive was copied into src/ and the
                        # copy was "removed" with a path built from the full
                        # source path, so the copy was never deleted; the `cd`
                        # also leaked into later iterations and broke relative
                        # paths. Extract straight from the original archive
                        # (absolute path) inside a subshell instead: no copy
                        # is left behind and the loop's working directory is
                        # untouched.
                        archive_abs=$(readlink -f "${archive}")
                        (
                            cd "${unzip_des_dir_tmp}" || exit 1
                            /usr/lib/rpm/rpmuncompress -x "${archive_abs}"
                        )
                    fi
                done

            fi
        done

        log_msg "procing nums[${unzip_num}]"
    }
}
#-------------------------------------------------
# Description:
#   Download (and decompress) the primary.xml referenced by a repomd.xml.
# Arguments:
#   $1: download URL of repomd.xml
#   $2: directory in which primary.xml is stored
# Globals written (read by download_rpm_by_repomdxml):
#   primary_xml_gz_file_name: file name of the compressed primary metadata
#   primary_xml_file_name:    the same name with its last suffix removed
# Returns:
#   0 on success, or when the URL is empty or primary.xml already exists;
#   1 on a wrong argument count or a failed download.
#   The caller's working directory is restored on every return path.
#-------------------------------------------------
download_primary_xml_by_repomdxml()
{
    if [ $# -ne 2 ]; then
        echo "[error] download_primary_xml_by_repomdxml args num error: [$#]"
        # Continuing would run rm/wget in whatever directory we happen to be in.
        return 1
    fi

    local repo_url="$1"
    local des_path="$2"
    local org_path
    local url_base_primary
    local url_primary_xml_file
    org_path=$(pwd)

    if [ -z "${repo_url}" ]; then
        echo "[warn] repomd.xml url is empty: [${repo_url}]"
        return 0
    fi

    if [ ! -d "${des_path}" ]; then
        mkdir -p "${des_path}"
        chmod 775 "${des_path}"
    fi
    echo "[log] work path: [${des_path}]"

    cd "${des_path}" || exit 1

    # wget would otherwise save the new index as repomd.xml.1
    rm -f repomd.xml

    # Download repomd.xml.
    wget "${repo_url}"
    if [ ! -f "./repomd.xml" ]; then
        log_msg "[error] wget repomd.xml error[${repo_url}] ..."
        cd "${org_path}" || exit 1
        return 1
    fi

    # Locate the primary metadata entry; prefer the .gz variant.
    primary_xml_gz_file_name=$(grep "primary.xml.gz" ./repomd.xml)
    if [ -z "${primary_xml_gz_file_name}" ]; then
        primary_xml_gz_file_name=$(grep "primary.xml" ./repomd.xml | head -n 1)
    fi
    # Keep only the file name from href="repodata/<name>".
    primary_xml_gz_file_name=$(echo "${primary_xml_gz_file_name}" | awk -F'/' '{print $2}' | awk -F'"' '{print $1}')

    if [ -z "${primary_xml_gz_file_name}" ]; then
        # Without a name the download URL below would point at the directory.
        primary_xml_file_name=""
        log_msg "[error] primary.xml not listed in repomd.xml: [${repo_url}] ..."
        rm -f repomd.xml
        cd "${org_path}" || exit 1
        return 1
    fi

    # shellcheck disable=SC2001
    primary_xml_file_name=$(echo "${primary_xml_gz_file_name}" | sed 's/\.[^.]*$//')

    if [ -f "${primary_xml_file_name}" ]; then
        # The decompressed metadata is already present; nothing to download.
        log_msg "[warn] primary.xml exist: ${des_path}/${primary_xml_file_name}"
        rm -f repomd.xml
        cd "${org_path}" || exit 1
        return 0
    fi
    [ -f "${primary_xml_gz_file_name}" ] && rm -f "${primary_xml_gz_file_name}"

    # Download the primary metadata from the same repodata directory.
    url_base_primary=$(echo "${repo_url}" | awk -F'/repomd.xml' '{print $1}')
    url_primary_xml_file="${url_base_primary}/${primary_xml_gz_file_name}"
    wget "${url_primary_xml_file}"
    if [ ! -f "${primary_xml_gz_file_name}" ]; then
        log_msg "[error] wget primary.xml error: [${url_primary_xml_file}] ..."
        rm -f repomd.xml
        cd "${org_path}" || exit 1
        return 1
    fi
    gzip -d "${primary_xml_gz_file_name}"

    [ -f repomd.xml ] && rm -f repomd.xml
    [ -f "${primary_xml_gz_file_name}" ] && rm -f "${primary_xml_gz_file_name}"

    cd "$org_path" || exit 1
}
#-------------------------------------------------
# Description:
#   Download every package listed in the primary.xml of a source repository
#   (located through its repomd.xml).
# Arguments:
#   $1: download URL of repomd.xml
#   $2: directory in which the packages are stored
# Returns:
#   1 on a wrong argument count or when primary.xml could not be obtained;
#   0 when the URL is empty.
#-------------------------------------------------
download_rpm_by_repomdxml()
{
    if [ $# -ne 2 ]; then
        echo "[error] download_rpm_by_repomdxml args num error: [$#]"
        return 1
    fi

    local repo_url="$1"
    local des_path="$2"
    local org_path
    org_path=$(pwd)

    if [ -z "${repo_url}" ]; then
        echo "[warn] repomd.xml url is empty: [${repo_url}]"
        return 0
    fi

    if [ ! -d "${des_path}" ]; then
        mkdir -p "${des_path}"
        chmod 775 "${des_path}"
    fi
    echo "[log] des path: [${des_path}]"

    cd "${des_path}" || exit 1
    # Use the absolute path from here on: the helper below changes into
    # des_path again, which would resolve a relative path a second time.
    des_path=$(pwd)

    # Download the metadata; the helper reports the file names through these
    # two global variables.
    primary_xml_gz_file_name=""
    primary_xml_file_name=""
    download_primary_xml_by_repomdxml "${repo_url}" "${des_path}"

    # Return to the download directory in case the helper left us elsewhere.
    cd "${des_path}" || exit 1
    if [ -z "${primary_xml_file_name}" ] || [ ! -f "${primary_xml_file_name}" ]; then
        log_msg "[error] primary.xml not available, skip rpm download: [${repo_url}] ..."
        [ -f repomd.xml ] && rm -f repomd.xml
        cd "$org_path" || exit 1
        return 1
    fi

    # Download the packages: every href in primary.xml is relative to the
    # repository root (the part of the URL before /repodata).
    url_rpm_base_path=$(echo "${repo_url}" | awk -F'/repodata' '{print $1}')
    grep href= "${primary_xml_file_name}" | awk -F'"' '{print $2}' | while read -r line
    do
        rpm_name=$(echo "$line" | awk -F'/' '{print $NF}')
        if [ -f "${rpm_name}" ]; then
            log_msg "file exist [${rpm_name}]"
            continue
        fi

        url_rpm_path="${url_rpm_base_path}/${line}"
        wget "${url_rpm_path}"
    done

    # Remove the metadata files; only the packages are kept.
    [ -f repomd.xml ] && rm -f repomd.xml
    [ -f "${primary_xml_gz_file_name}" ] && rm -f "${primary_xml_gz_file_name}"
    [ -f "${primary_xml_file_name}" ] && rm -f "${primary_xml_file_name}"

    cd "$org_path" || exit 1
}
def get_embedding_local(text):
    """Embed *text* with the module-level ``embedding_model``.

    Newlines are replaced by spaces first; the first vector produced by
    ``embedding_model.embed`` is returned.
    """
    flattened = text.replace("\n", " ")
    return list(embedding_model.embed(flattened))[0]


# If the model cannot be downloaded, set the "HF_ENDPOINT" environment
# variable before running.
def embedding_calculate_similarity_local(featureInfo1, featureInfo2):
    """Compare two feature records via the local embedding model.

    Each record is converted with ``str()``, stripped of the punctuation
    characters ``{}[]()@#.':/-`` and embedded; the cosine similarity of the
    two vectors, rounded to 4 decimals, is returned as ``{"totalSim": value}``.
    """
    cleaned = [
        re.sub(r'[{}[\]()@#.\':\/-]', '', str(feature_info))
        for feature_info in (featureInfo1, featureInfo2)
    ]
    first_vec, second_vec = (get_embedding_local(feature_str) for feature_str in cleaned)
    score = cosine_similarity([first_vec], [second_vec])[0][0]
    return {"totalSim": round(score, 4)}
def calculate_similarity(featureInfo1, featureInfo2):
    """
    Compute the per-feature and the weighted total similarity of two records.

    Args:
        featureInfo1 (dict): feature name -> feature value of the first record
        featureInfo2 (dict): feature name -> feature value of the second record

    Returns:
        dict: feature name -> similarity (rounded to 4 decimals) for every
        feature that is present in both records and has a weight in
        config.FEATURE_WEIGHT_MAP, plus the key "totalSim" holding the
        rounded weighted sum.
    """
    # "url" and "source0" only contribute to the total above this threshold.
    gated_features = ("url", "source0")
    weights = config.FEATURE_WEIGHT_MAP
    result = {}
    weighted_sum = 0.0
    for name in set(featureInfo1.keys()) & set(featureInfo2.keys()):
        # Features without a configured weight are ignored entirely.
        if name not in weights:
            continue
        weight = weights[name]
        score = round(calculate_feature_similarity(name, featureInfo1[name], featureInfo2[name]), 4)
        if name in gated_features and not score > 0.5:
            contribution = 0
        else:
            contribution = score * weight
        weighted_sum += contribution
        result[name] = score
    result["totalSim"] = round(weighted_sum, 4)
    return result


def calculate_feature_similarity(feature_name, input1, input2):
    """
    Dispatch to the similarity function registered for one feature.

    Args:
        feature_name (str): feature name, looked up in similarity_functions
        input1: value of the feature in the first record
        input2: value of the feature in the second record

    Returns:
        0 when either value is the empty string, otherwise the result of the
        registered similarity function.

    Raises:
        ValueError: if no similarity function is registered for the feature
    """
    if input1 == '' or input2 == '':
        return 0
    if feature_name not in similarity_functions:
        raise ValueError("Similarity function for feature '{}' not found.".format(feature_name))
    handler = similarity_functions[feature_name]
    return handler(input1, input2)
def calculate_version_similarity(input1, input2):
    """Version similarity: delegates to similarity_calculator_common.levenshtein_similarity."""
    return similarity_calculator_common.levenshtein_similarity(input1, input2)


def calculate_summary_similarity(input1, input2):
    """Summary similarity: delegates to similarity_calculator_common.cosine_similarity."""
    return similarity_calculator_common.cosine_similarity(input1, input2)


def calculate_description_similarity(input1, input2):
    """Description similarity: delegates to similarity_calculator_common.cosine_similarity."""
    return similarity_calculator_common.cosine_similarity(input1, input2)


def calculate_url_similarity(input1, input2):
    """URL similarity: delegates to similarity_calculator_common.levenshtein_similarity."""
    return similarity_calculator_common.levenshtein_similarity(input1, input2)


def calculate_requires_similarity(input1, input2):
    """Requires similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_provides_similarity(input1, input2):
    """Provides similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_binary_packages_similarity(input1, input2):
    """Binary-package-list similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_src_filelist_similarity(input1, input2):
    """Source-file-list similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_build_requires_similarity(input1, input2):
    """BuildRequires similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_source0_similarity(input1, input2):
    """Source0 similarity: delegates to similarity_calculator_common.levenshtein_similarity."""
    return similarity_calculator_common.levenshtein_similarity(input1, input2)


def calculate_macro_names_similarity(input1, input2):
    """Macro-name similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_email_names_similarity(input1, input2):
    """E-mail-name similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_class_names_similarity(input1, input2):
    """Class-name similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_path_names_similarity(input1, input2):
    """Path-name similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)


def calculate_url_names_similarity(input1, input2):
    """URL-name similarity: delegates to similarity_calculator_common.jaccard_similarity."""
    return similarity_calculator_common.jaccard_similarity(input1, input2)
calculate_recommends_similarity(input1, input2): + return similarity_calculator_common.difflib_similarity(input1, input2) + + +def calculate_suggests_similarity(input1, input2): + return similarity_calculator_common.difflib_similarity(input1, input2) + + +def calculate_filelist_similarity(input1, input2): + set1 = set(input1) + set2 = set(input2) + intersection = set1.intersection(set2) + union = set1.union(set2) + + if len(union) == 0: + similarity = 0.0 + else: + similarity = len(intersection) / len(union) + + return similarity + # return similarity_calculator_common.jaccard_similarity(input1,input2) + + +""" +每个键都是一个特征名,对应的值是一个相似度计算函数。这些函数用于计算相应特征的相似度。 +可以根据具体的特征名从字典中获取相应的相似度计算函数 +""" +similarity_functions = { + "name": calculate_name_similarity, + "version": calculate_version_similarity, + #"summary": calculate_summary_similarity, + #"description": calculate_description_similarity, + "url": calculate_url_similarity, + "requires": calculate_requires_similarity, + # "recommends": calculate_recommends_similarity, + # "suggests": calculate_suggests_similarity, + "provides": calculate_provides_similarity, + "binaryList": calculate_binary_packages_similarity, + # "src_filelist": calculate_src_filelist_similarity, + "buildRequires": calculate_build_requires_similarity, + "source0": calculate_source0_similarity, + "macro_names": calculate_macro_names_similarity, + "email_names": calculate_email_names_similarity, + "class_names": calculate_class_names_similarity, + "path_names": calculate_path_names_similarity, + "url_names": calculate_url_names_similarity, + "filelist": calculate_filelist_similarity +} + + +# source_list为源系统和目标系统存在相同的source0的清单 +source_list = "/root/pkg-mapping/analyse/res/similarity/new/no-version/same_source_packages.txt" + +# 源系统、目标系统 +src_sys="centos-new@9-stream~src" +des_sys="openeuler-new@24.03~src" + +# merge之后的特征文件 +file_json = "/root/pkg-mapping/analyse/json/new/centos_9-stream_merge.json" +file_json2 = 
"/root/pkg-mapping/analyse/json/new/openeuler_openEuler-24.03-LTS_merge.json" + +if __name__ == '__main__': + + with open(file_json, 'r') as f: + json_data = f.read() + data1 = json.loads(json_data) + with open(file_json2, 'r') as f: + json_data2 = f.read() + data2 = json.loads(json_data2) + + source_arr = [] + with open(source_list, 'r') as file: + for line in file: + line = re.sub(r'\n', '', line) + if line and line not in source_arr: + source_arr.append(line) + + similarity_num_src = {} + similarity_num_all = {} + similarity_num_one = {} + similarity_min = 1.0 + for src in tqdm(source_arr): + # source0不去掉版本进行匹配,以及计算相似度 + src_names=SelfLib.es_search_record_by_name_term(src_sys, "source0", src, "name", 10) + des_names=SelfLib.es_search_record_by_name_term(des_sys, "source0", src, "name", 10) + + # source0去掉版本进行匹配,以及计算相似度 + #src_tmp=re.sub(r'{', r'\\{', src) + #src_tmp=re.sub(r'}', r'\\}', src_tmp) + #src_names=SelfLib.es_search_record_by_name_regexp(src_sys, "source0", src_tmp + '[^/]*', "name", 10) + #des_names=SelfLib.es_search_record_by_name_regexp(des_sys, "source0", src_tmp + '[^/]*', "name", 10) + + dict_tmp = {} + for src_name in src_names: + one_src_to_des = {} + for des_name in des_names: + flag1 = False + featureInfo1 = {} + for key in data1: + if data1[key]["name"] == src_name: + flag1 = True + featureInfo1 = data1[key] + flag2 = False + featureInfo2 = {} + for key in data2: + if data2[key]["name"] == des_name: + flag2 = True + featureInfo2 = data2[key] + if not flag1: + print("无源包名:" + src_name) + if not flag2: + print("无目标包名:" + des_name) + if flag1 and flag2: + # 使用本地embeding模型计算相似度 + similarity_tmp = embedding_calculate_similarity_local(featureInfo1, featureInfo2) + # 使用原始算法计算相似度 + #similarity_tmp = calculate_similarity(featureInfo1, featureInfo2) + key_name = src_name + ' -> ' + des_name + if similarity_tmp: + dict_tmp[key_name] = similarity_tmp['totalSim'] + else: + dict_tmp[key_name] = 0 + similarity_num_all[key_name] = dict_tmp[key_name] + 
one_src_to_des[des_name] = dict_tmp[key_name] + + if similarity_min > dict_tmp[key_name]: + similarity_min = dict_tmp[key_name] + one_src_to_des_list = list(sorted(one_src_to_des.items(), key=lambda item: item[1], reverse=True)) + similarity_num_one[src_name + ' -> ' + one_src_to_des_list[0][0]] = one_src_to_des_list[0][1] + similarity_num_src[src] = dict(sorted(dict_tmp.items(), key=lambda item: item[1], reverse=True)) + similarity_num_all_sort = dict(sorted(similarity_num_all.items(), key=lambda item: item[1], reverse=True)) + similarity_num_one_sort = dict(sorted(similarity_num_one.items(), key=lambda item: item[1], reverse=True)) + + for src in similarity_num_src: + print(f"{src}\n\t{similarity_num_src[src]}") + + print("\n\n") + for item in similarity_num_all_sort: + print(f"[{item}]: {similarity_num_all_sort[item]}") + + print("\n\n") + num_total = 0 + num_high = 0 + for item in similarity_num_one_sort: + print(f"[{item}]: {similarity_num_one_sort[item]}") + num_total += 1 + if similarity_num_one_sort[item] >= 0.9: + num_high += 1 + + print(f"\ntotal: {num_total}, over 0.9: {num_high}, less then 0.9: {num_total - num_high}") diff --git a/unzip_package/unzip_openeuler_openEuler-24.03-LTS.sh b/unzip_package/unzip_openeuler_openEuler-24.03-LTS.sh new file mode 100644 index 0000000000000000000000000000000000000000..f751bc0a7f388a0fd84c69e2d4257daa9a3e2659 --- /dev/null +++ b/unzip_package/unzip_openeuler_openEuler-24.03-LTS.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +source ./../lib/lib_rpm.sh + +base_dir="/home/lxb/pkgmapping/openeuler/" +version="openEuler-24.03-LTS" + +# 个人下载源码仓 +src_rpm_base_dir_local="${base_dir}/${version}/srpm" +src_rpm_dirs_local=("update-testing-Everything") + +log_file="${base_dir}/${version}/log/unzip-log" +if [ ! -d "${base_dir}/${version}/log" ]; then + mkdir -p "${base_dir}/${version}/log" + chmod 775 "${base_dir}/${version}/log" +fi +log_msg() +{ + echo "$@" | tee -a "${log_file}" +} + +des_dir="${base_dir}/${version}/repo" +if [ ! 
-d "${des_dir}" ]; then + mkdir -p "${des_dir}" + chmod 775 "${des_dir}" +fi + +for dir in "${src_rpm_dirs_local[@]}" +do + #unzip_src_rpm_from_dir_old "${src_rpm_base_dir_local}/${dir}" "${des_dir}" + uncompress_src_rpm_by_rpmuncompress "${src_rpm_base_dir_local}/${dir}" "${des_dir}" +done diff --git a/utils/check-correct-rate-same-source.py b/utils/check-correct-rate-same-source.py new file mode 100644 index 0000000000000000000000000000000000000000..36782ff871f7fe5edc395aa5083c0e2c2cd59055 --- /dev/null +++ b/utils/check-correct-rate-same-source.py @@ -0,0 +1,106 @@ +import json +import os +import sys +import re + +sys.path.append('..') +from utils.es_util import ES as es +import lib.lib_py_es as SelfLib + +#------------------------------------------------- +# 功能描述: +# 检查相同source0下,rpm包映射的正确率 +# 说明: +# 1. 支持source0有版本和无版本 +#------------------------------------------------- + +src_sys="fedora-old@40~src" +des_sys="openeuler-old@24.03~src" + +# 此处不使用_final文件,因为_final文件是去除重名文件的 +mapping_file = "/root/pkg-mapping/analyse/data/src/centos-new@9-stream-to-openeuler-new@24.03.yml" + +# 具有相同source0时,源系统源码包的包名 +same_source_file = "/root/pkg-mapping/analyse/res/similarity/new/version/same_source_packages.txt" + +# 从匹配文件中读取数据 +mapping_package = {} +with open(mapping_file, 'r') as file: + for line in file: + record = line.rstrip('\n') + split1 = record.split(':') + src_name = split1[0] + des_names = split1[1].split(' ') + if src_name in mapping_package: + print("[{}] package exist ...".format(src_name)) + continue + + content = [item for item in des_names if item or item == 0] + mapping_package[src_name] = content + +# 从same_name中获取数据 +match_succ_num = 0 +match_fail_num = 0 +proc_num = 0 +with open(same_source_file, 'r') as file: + for line in file: + proc_num += 1 + + src_name = line.rstrip('\n') + # 根据name查询,获取到source + sources = SelfLib.es_search_record_by_name_term(src_sys, 'name', src_name, 'source0') + + match_flag = False + name_match_falg = False + for source in 
sources: + # 根据source查询,获取目标包名 + # 有版本信息 + #des_names = SelfLib.es_search_record_by_name_term(des_sys, 'source0', source, 'name') + + # 无版本信息 + source_tmp = SelfLib.get_inf_ignore_version(source) + source_tmp=re.sub(r'{', r'\\{', source_tmp) + source_tmp=re.sub(r'}', r'\\}', source_tmp) + source_tmp = source_tmp + '[^/]*' + des_names = SelfLib.es_search_record_by_name_like(des_sys, 'source0', source_tmp, 'name') + + # 使用正则查询,可能存在查询失败的情况(则进行补充查询) + if not des_names : + des_names = SelfLib.es_search_record_by_name_term(des_sys, 'source0', source, 'name') + if not des_names : + source_tmp = SelfLib.get_inf_ignore_version(source) + source_tmp = source_tmp + '*' + des_names = SelfLib.es_search_record_by_name_like(des_sys, 'source0', source_tmp, 'name') + + # 检查匹配文件中是否存在对应的映射 + for des_name in des_names: + if src_name in mapping_package: + match_content = mapping_package[src_name] + if des_name in match_content: + # 匹配成功 + match_flag = True + with open("sorce_match.txt", 'a') as res_file: + res_file.write(f"{src_name} {source} {des_name}\n") + break + + if src_name == des_name: + name_match_falg = True + + if match_flag: + match_succ_num += 1 + break + else : + print("source not mapping package: [{}]-[{}]-[{}]".format(src_name, source, des_names)) + + if not match_flag: + print("name not mapping package: [{}]-[{}]".format(src_name, sources)) + match_fail_num += 1 + + if name_match_falg : + print("\nnot mapping, but name equele ...\n") + + + +print("source相同数量: {}".format(proc_num)) +print("匹配成功: {} - {:.2f}%".format(match_succ_num, match_succ_num*100.0/proc_num)) +print("匹配失败: {} - {:.2f}%".format(match_fail_num, match_fail_num*100.0/proc_num)) diff --git a/utils/match-rpm-name-source-name_to_name.py b/utils/match-rpm-name-source-name_to_name.py new file mode 100644 index 0000000000000000000000000000000000000000..da8c20b5bc360874a8e443d615a3978ba64fd983 --- /dev/null +++ b/utils/match-rpm-name-source-name_to_name.py @@ -0,0 +1,69 @@ +import json +import os +import re 
+import sys + +sys.path.append('..') +import lib.lib_py_es as SelfLib + +#------------------------------------------------- +# 功能描述: +# 根据特征文件,获取具有相同source0的清单(system1 rpm name : system2 rpm name) +# 注:可使用有版本和无版本 +#------------------------------------------------- + +src_file_1="/root/pkg-mapping/analyse/json/old/fedora_40_merge.json" +src_file_2="/root/pkg-mapping/analyse/json/old/openeuler_openEuler-24.03-LTS_merge.json" + +# 结果目录 +res_path="an-res" + +if not os.path.exists(res_path) or not os.path.isdir(res_path): + os.makedirs(res_path) + +# 打开并读取 JSON 文件 +with open(src_file_1, 'r') as file: + json_data = file.read() + +with open(src_file_2, 'r') as file: + des_json_data = file.read() + +# 解析 JSON 数据 +src_data = json.loads(json_data) +des_data = json.loads(des_json_data) + +# Extract all names and sources +names_and_sources_src = [(item['name'], item['source0']) for item in src_data.values()] + + +des_source0_to_names = {} +for item in des_data.values(): + name = item['name'] + # source0无版本 + source0 = SelfLib.get_source_ignore_version(item['source0']) + # source0有版本 + #source0 = item['source0'] + if source0 in des_source0_to_names: + if name not in des_source0_to_names[source0]: + des_source0_to_names[source0].append(name) + else: + des_source0_to_names[source0]=[name] + +same_names=[] +same_source0_names = [] +for name, source in names_and_sources_src: + if name not in same_names: + same_names.append(name) + # source0无版本 + source = SelfLib.get_source_ignore_version(source) + # source0有版本 + #source = source + if source and source in des_source0_to_names: + for name_tmp in des_source0_to_names[source]: + same_source0_names.append(name + " " + name_tmp) + +with open(os.path.join(res_path, "same_source_name_packages-new-noversion.txt"), "w") as file: + for item in same_source0_names: + file.write(f"{item}\n") + +print("sourcelen: {}".format(len(same_source0_names))) \ No newline at end of file diff --git a/utils/match-rpm-name-source.py 
b/utils/match-rpm-name-source.py new file mode 100644 index 0000000000000000000000000000000000000000..9cc405aa54b92f4ea1ef21ac8dc7744b181f14d2 --- /dev/null +++ b/utils/match-rpm-name-source.py @@ -0,0 +1,93 @@ +import json +import os +import re +import sys + +sys.path.append('..') +import lib.lib_py_es as SelfLib + +#------------------------------------------------- +# 功能描述: +# 根据特征文件,获取具有相同source0的源系统rpm name清单(system1 rpm name) +# 注:可使用有版本和无版本 +#------------------------------------------------- + +src_file_1="/root/pkg-mapping/analyse/json/old/fedora_40_merge.json" +src_file_2="/root/pkg-mapping/analyse/json/old/openeuler_openEuler-24.03-LTS_merge.json" + +# 结果目录 +res_path="an-res" + +if not os.path.exists(res_path) or not os.path.isdir(res_path): + os.makedirs(res_path) + +# 打开并读取 JSON 文件 +with open(src_file_1, 'r') as file: + json_data = file.read() + +with open(src_file_2, 'r') as file: + des_json_data = file.read() + +# 解析 JSON 数据 +src_data = json.loads(json_data) +des_data = json.loads(des_json_data) + +# source0存在版本 +names_and_sources = [(item['name'], item['source0']) for item in src_data.values()] +des_names = [item['name'] for item in des_data.values()] +des_sources = [item['source0'] for item in des_data.values()] + +# 忽略source0的版本 +#names_and_sources = [(item['name'], SelfLib.get_source_ignore_version(item['source0'])) for item in src_data.values()] +#des_names = [item['name'] for item in des_data.values()] +#des_sources = [SelfLib.get_source_ignore_version(item['source0']) for item in des_data.values()] + +total_name = [] +for name, source in names_and_sources: + if name not in total_name: + total_name.append(name) + +print("len: {}, len: {}, len total: {}".format(len(names_and_sources), len(des_names), len(total_name))) + +same_names = [] +same_source_names = [] +same_sources = [] +same_name_sources = [] +diff_names = [] +# Print the extracted names and sources +for name, source in names_and_sources: + if source in des_sources and len(source)>0: + 
if name not in same_source_names: + same_source_names.append(name) + same_name_sources.append(name + " " + source) + if source not in same_sources: + same_sources.append(source) + + elif name in des_names: + if name not in same_names: + same_names.append(name) + else: + if name not in diff_names: + diff_names.append(name) + +with open(os.path.join(res_path, "same_source_name_packages.txt"), "w") as file: + for name in same_source_names: + file.write(f"{name}\n") + +with open(os.path.join(res_path, "same_name_source_packages.txt"), "w") as file: + for name in same_name_sources: + file.write(f"{name}\n") + +with open(os.path.join(res_path, "same_source_packages.txt"), "w") as file: + for source in same_sources: + file.write(f"{source}\n") + +with open(os.path.join(res_path, "same_name_packages.txt"), "w") as file: + for name in same_names: + file.write(f"{name}\n") + +with open(os.path.join(res_path, "diff_packages.txt"), "w") as file: + for name in diff_names: + file.write(f"{name}\n") + +print("source_name len: {}, sourcelen: {}, name len: {}, diff len: {}".format(len(same_source_names), len(same_sources), len(same_names), len(diff_names))) \ No newline at end of file diff --git a/utils/repair-rpm-mapping.sh b/utils/repair-rpm-mapping.sh new file mode 100644 index 0000000000000000000000000000000000000000..e444ba292e9aa32f73063f6f888cffc32befb602 --- /dev/null +++ b/utils/repair-rpm-mapping.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +#----------------------------------------------------- +# 功能描述:当前文件用于支持rpm包名映射相关功能 +#----------------------------------------------------- + +# log日志文件,spec文件 + +# 工作目录 +rpm_mapping_work="${HOME}/repair-dir/work" +rpm_repair_mapping_utils_path="/lkp/lkp/src/programs/rpmbuild/utils" + +rpm_package_name_new_tmp="" + +#----------------------------------------------------------- +# 功能描述:根据二进制包映射关系,找到一个新的包 +# 参 数: +# 参数1:原rpm包名 +# 参数2:映射关系 +#----------------------------------------------------------- +get_new_rpm_package_name() +{ + 
#-----------------------------------------------------------
# Pick one replacement package for a failed rpm package name.
# Arguments:
#   $1: original rpm package name
#   $2: mapping record (space separated candidate package names)
# Output:
#   sets the global rpm_package_name_new_tmp to the chosen name;
#   exits 1 when arguments are missing or no candidate is similar enough.
#-----------------------------------------------------------
get_new_rpm_package_name()
{
    rpm_package_name_new_tmp=""
    [ $# -lt 2 ] && echo "[error] get_new_rpm_package_name arg nums error: [$#]" && exit 1

    rpm_package_name="$1"
    des_rpm_name_record="$2"

    [ -z "${rpm_package_name}" ] && echo "[error] orginal rpm package name is empty [${rpm_package_name}]" && exit 1
    [ -z "${des_rpm_name_record}" ] && echo "[error] des rpm binary is empty [${des_rpm_name_record}]" && exit 1

    # Call the python helper (needs: pip install Levenshtein).
    # The helper is invoked directly with quoted arguments instead of through
    # a string passed to `bash -c`, so quotes/spaces in names cannot break or
    # inject into the command line.
    local mapping_script="${rpm_repair_mapping_utils_path}/rpm-package-mapping.py"
    echo "[log] comond: python ${mapping_script} -s \"${rpm_package_name}\" -d \"${des_rpm_name_record}\" -b 0.6"
    mapping_proc_res=$(python "${mapping_script}" -s "${rpm_package_name}" -d "${des_rpm_name_record}" -b 0.6)
    echo "[log] rpm package mapping log: "
    echo "${mapping_proc_res}"
    # The verdict is the last whitespace-separated token of the helper output.
    mapping_proc_res=$(echo "${mapping_proc_res}" | tr '\n' ' ')
    read -ra mapping_proc_res_arr <<< "${mapping_proc_res}"
    mapping_proc_res=${mapping_proc_res_arr[-1]}
    echo "[log] rpm package mapping result: ${mapping_proc_res}"

    # Check the result: SIMILARITY-MATCH:[SUCCESS]-[<name>]
    if [[ ${mapping_proc_res} == SIMILARITY-MATCH:\[SUCCESS\]-* ]]; then
        rpm_package_name_new=$(echo "${mapping_proc_res}" | awk -F'\\]-\\[' '{print $2}' | awk -F'\\]' '{print $1}')
        rpm_package_name_new_tmp="${rpm_package_name_new}"
    else
        # Failure: report the actual package name (was printed as a literal).
        echo "[error] rpm package mapping fail : [${rpm_package_name}]"
        exit 1
    fi
}


#----------------------------------------------------------------------------
# Repair a spec file using the rpm package-name mapping.
# Options:
#   -s: source spec file (path including file name)
#   -l: source build log file (path including file name)
#   -d: directory that receives the repaired spec (no file name)
#   -b: backup directory (no file name)
#   -m: mapping file (path including file name)
#   -t: mapping type (one: replace the failing rpm with the best match;
#                     all: add every rpm the failing one maps to)
#   -w: work directory (defaults to ${rpm_mapping_work}; it is wiped)
#----------------------------------------------------------------------------
repair_spec_by_rpm_mapping()
{
    local spec_file=""
    local log_file=""
    local repair_spec_dir=""
    local bak_dir=""
    local mapping_file=""
    local type=""
    local work_dir=""
    # getopts keeps its cursor in OPTIND; reset it so the function parses its
    # own arguments correctly even when called more than once.
    local OPTIND=1
    local opt

    # Parse options
    while getopts ":s:l:d:b:m:t:w:" opt;
    do
        case $opt in
            s)
                spec_file=$OPTARG
                ;;
            l)
                log_file=$OPTARG
                ;;
            d)
                repair_spec_dir=$OPTARG
                ;;
            b)
                bak_dir=$OPTARG
                ;;
            m)
                mapping_file=$OPTARG
                ;;
            t)
                type=$OPTARG
                ;;
            w)
                work_dir=$OPTARG
                ;;
            \?)
                echo "[error] invalid option: -$OPTARG" >&2
                exit 1
                ;;
            :)
                echo "[error] option -$OPTARG requires an argument." >&2
                exit 1
                ;;
        esac
    done

    # Validate arguments
    files=("${spec_file}" "${log_file}" "${mapping_file}")
    for file in "${files[@]}"
    do
        [ ! -f "${file}" ] && echo "[error] file not exist: [${file}]" && exit 1
    done

    [ -n "${work_dir}" ] || work_dir="${rpm_mapping_work}"

    paths=("${repair_spec_dir}" "${bak_dir}" "${work_dir}")
    for path in "${paths[@]}"
    do
        if [ ! -d "${path}" ]; then
            mkdir -p "${path}"
        fi
    done

    if [ -z "${type}" ]; then
        echo "[error] need option -t, to appoint replace rm package type"
        exit 1
    elif [ 'one' != "${type}" ] && [ 'all' != "${type}" ]; then
        echo "[error] option -t, the value must be one or all"
        exit 1
    fi

    # Spec file name
    spec_name=$(basename "${spec_file}")

    # Start from a clean work directory
    rm -rf "${work_dir}"
    mkdir -p "${work_dir}"

    # Collect the packages the build log reports as not installable
    rpm_package_names=$(grep -o "No matching package to install.*" "${log_file}" | sort | uniq | while read -r line
        do
            # shellcheck disable=SC2001
            rpm_package_name=$(echo "$line" | sed "s/No matching package to install: '\(.*\)'/\1/" | awk -F' ' '{print $1}')
            echo -n "${rpm_package_name} "
        done
    )
    echo "[log] need mapping rpm package: ${rpm_package_names}"

    rpm_package_name_arr=()
    read -ra rpm_package_name_arr <<< "${rpm_package_names}"
    echo "[log] ${rpm_package_name_arr[*]}"

    # Package name replacement
    mkdir -p "${work_dir}/spec-src"
    mkdir -p "${work_dir}/spec-repair"
    local spec_file_old="${work_dir}/spec-src/${spec_name}"
    local spec_file_new="${work_dir}/spec-repair/${spec_name}"
    local spec_file_tmp="${work_dir}/spec-repair/spec-tmp"
    cp -f "${spec_file}" "${spec_file_old}"
    cp -f "${spec_file}" "${spec_file_new}"

    for rpm_package_name in "${rpm_package_name_arr[@]}"
    do
        echo "[log] mapping rpm packkage: [${rpm_package_name}] ..."

        # Look up the mapping record. The pattern is anchored at line start so
        # that "bar" does not also select the record of "foo-bar" (the sed
        # below only strips an anchored prefix).
        des_rpm_name_record=$(grep "^[[:space:]]*${rpm_package_name}[[:space:]]*:[[:space:]]*" "${mapping_file}" | sed "s/^[[:space:]]*${rpm_package_name}[[:space:]]*:[[:space:]]*//")

        if [ -z "${des_rpm_name_record}" ]; then
            echo "no mapping any rpm packkage: [${rpm_package_name}]-[${des_rpm_name_record}] "
            exit 1
        fi

        if [ "one" == "${type}" ]; then
            # Choose the single best matching package name
            rpm_package_name_new_tmp=""
            get_new_rpm_package_name "${rpm_package_name}" "${des_rpm_name_record}"
            new_rpm_package_name="${rpm_package_name_new_tmp}"
            if [ -z "${new_rpm_package_name}" ]; then
                echo "[error] rpm package name to mapping : [${new_rpm_package_name}]"
                exit 1
            fi

            echo "[log] replace rpm package [${rpm_package_name}] to [${new_rpm_package_name}]"

            sed "s/\(:[[:space:]]*\)${rpm_package_name}\([[:space:]]*$\)/\1${new_rpm_package_name}\2/g" "${spec_file_new}" > "${spec_file_tmp}"

        elif [ "all" == "${type}" ]; then
            # Cases to keep in mind:
            #   1. ":[[:space:]]*name"
            #   2. ":[[:space:]]*name1,[[:space:]]*name2,[[:space:]]*name3"
            #   3. ":[[:space:]]*name > xx.yy.zz"
            #   4. "Requires(pre):[[:space:]]*name"
            #   5. macros present
            line_num=$(grep -E ":[[:space:]]*${rpm_package_name}([[:space:]]*$|[[:space:]]+)" "${spec_file_new}" | sort | uniq | wc -l)
            if [ "${line_num}" -ne 1 ]; then
                echo "[error] ${rpm_package_name} match many record [${line_num}]"
                exit 1
            fi

            require_type=$(grep -E ":[[:space:]]*${rpm_package_name}([[:space:]]*$|[[:space:]]+)" "${spec_file_new}" | awk -F':' '{print $1}')

            # Build the replacement: one "<tag>: <package>" line per mapped package
            des_replace_content=""
            des_rpm_name_arr=()
            read -ra des_rpm_name_arr <<< "${des_rpm_name_record}"
            for des_rpm_name in "${des_rpm_name_arr[@]}"
            do
                des_replace_content="${des_replace_content}\n${require_type}: ${des_rpm_name}"
            done

            sed "s/${require_type}:[[:space:]]*${rpm_package_name}[[:space:]]*$/${des_replace_content}/g" "${spec_file_new}" > "${spec_file_tmp}"
        fi
        mv "${spec_file_tmp}" "${spec_file_new}"
    done
    cp -f "${spec_file_new}" "${repair_spec_dir}/${spec_name}"
    cp -rf "${work_dir}"/* "${bak_dir}"/

    echo "[log] SUCCESS: repair spec by rpm mapping..."
}
import os
import re
import xml.etree.ElementTree as ET

##---------------------------------------------------------------------------------
## Purpose:
##   Using the source-package mapping, derive a binary-package-name to
##   binary-package-name mapping between two systems.
## Output:
##   One mapping file per architecture, one record per line (because of
##   version differences one binary may map to several source packages):
##     <source system binary> : <target binary 1> <target binary 2> ...
##---------------------------------------------------------------------------------

src_sys="fedora"
des_sys="openeuler"
map_type="_old"

src_binary_primary_xml_path = "/home/lxb/pkgmapping/fedora/40/bin-primaryxml/old"
des_binary_primary_xml_path = "/home/lxb/pkgmapping/openeuler/openEuler-24.03-LTS/bin-primaryxml/old"
source_rpm_mapping_file = "/root/pkg-mapping/analyse/data/src/fedora-old@40-to-openeuler-old@24.03.yml"

# Output location
res_path="res"
src_to_binary_map_file = "src_mapping_binary_" + src_sys + '_to_' + des_sys + map_type

# XML namespaces used by repodata primary.xml
_COMMON_NS = '{http://linux.duke.edu/metadata/common}'
_RPM_NS = '{http://linux.duke.edu/metadata/rpm}'


def _collect_source_to_binaries(binary_primary_xml_path: str, strip_version: bool):
    """Parse every *-primary.xml under the directory into {arch: {source rpm: [binary names]}}.

    When strip_version is True the source rpm key is cut at the first
    "-<digit>", leaving only the source package name.
    """
    src_map_rpm = {}
    for file_name in os.listdir(binary_primary_xml_path):
        if not file_name.endswith('-primary.xml'):
            continue

        root = ET.parse(os.path.join(binary_primary_xml_path, file_name)).getroot()
        for package in root.findall(_COMMON_NS + 'package'):
            # findtext yields '' for a missing or empty element, so an
            # incomplete <package> entry is skipped instead of crashing.
            name = package.findtext(_COMMON_NS + 'name', default='')
            arch = package.findtext(_COMMON_NS + 'arch', default='')
            source_rpm = package.findtext(_COMMON_NS + 'format/' + _RPM_NS + 'sourcerpm', default='')
            if strip_version:
                source_rpm = re.sub(r'-[0-9].*', '', source_rpm)
            source_rpm_name = source_rpm.strip()

            if not source_rpm_name:
                continue

            if not arch:
                print(f'[{name}] source rpm arch empty[{arch}]')
                continue

            binaries = src_map_rpm.setdefault(arch, {}).setdefault(source_rpm_name, [])
            if name not in binaries:
                binaries.append(name)
    return src_map_rpm


def src_mapping_binary_by_source_noversion(binary_primary_xml_path: str):
    """Return {arch: {source package name (version stripped): [binary package names]}}."""
    return _collect_source_to_binaries(binary_primary_xml_path, strip_version=True)


def src_mapping_binary_by_source(binary_primary_xml_path: str):
    """Return {arch: {source rpm file name (with version): [binary package names]}}."""
    return _collect_source_to_binaries(binary_primary_xml_path, strip_version=False)


def getBinaryMappingBinaryByArch(src_arch: str, des_arch: str, src_map_rpm: dict, des_map_rpm: dict, source_rpm_mapping_relation: dict):
    """Map source-system binaries of src_arch to target-system binaries of des_arch.

    Args:
        src_arch: architecture key to read from src_map_rpm.
        des_arch: architecture key to read from des_map_rpm.
        src_map_rpm: {arch: {source package: [binaries]}} of the source system.
        des_map_rpm: {arch: {source package: [binaries]}} of the target system.
        source_rpm_mapping_relation: {source package: [target source packages]}.

    Returns:
        dict: {source binary name: [target binary names]}; binaries whose
        source package has no mapped target binaries are omitted.
    """
    res_binary_mapping_binary = {}
    des_packages = des_map_rpm.get(des_arch, {})

    # Uses the dictionaries passed in, not module-level globals.
    for src_name, binary_names in src_map_rpm.get(src_arch, {}).items():
        if src_name not in source_rpm_mapping_relation:
            continue

        # source package -> target source packages -> target binaries
        mapping_tmp = []
        for mapped_des_src_name in source_rpm_mapping_relation[src_name]:
            mapping_tmp.extend(des_packages.get(mapped_des_src_name, []))

        if not mapping_tmp:
            continue

        for binary_name in binary_names:
            # Each binary gets its own list; sharing one list object would let
            # a later update for one binary leak into its siblings.
            targets = res_binary_mapping_binary.setdefault(binary_name, [])
            for target in mapping_tmp:
                if target not in targets:
                    targets.append(target)
    return res_binary_mapping_binary


def load_mapping_relation_by_file(mapping_file: str):
    """Load "<source name>: <target1> <target2> ..." records into {source name: [targets]}.

    Lines without a ':' are skipped; for a repeated source name the first
    record wins and a notice is printed.
    """
    mapping_relation = {}
    with open(mapping_file, 'r') as file:
        for line in file:
            src_name, separator, des_part = line.rstrip('\n').partition(':')
            src_name = src_name.strip()
            if not separator or not src_name:
                continue
            if src_name in mapping_relation:
                print("[{}] package exist ...".format(src_name))
                continue

            mapping_relation[src_name] = des_part.split()
    return mapping_relation


def merge_mapping_content(src_dict: dict, des_dict: dict):
    """Merge src_dict into des_dict in place (order kept, no duplicate targets) and return des_dict."""
    for src_name, des_names in src_dict.items():
        merged = des_dict.setdefault(src_name, [])
        for des_name in des_names:
            if des_name not in merged:
                merged.append(des_name)
    return des_dict


def main():
    """Build the per-architecture binary mapping files under res_path."""
    # Source package -> binaries, keyed by source package name without version.
    # (src_mapping_binary_by_source keeps the version instead.)
    src_src_map_rpm = src_mapping_binary_by_source_noversion(src_binary_primary_xml_path)
    des_src_map_rpm = src_mapping_binary_by_source_noversion(des_binary_primary_xml_path)

    # Source package mapping between the two systems
    source_rpm_mapping_relation = load_mapping_relation_by_file(source_rpm_mapping_file)

    os.makedirs(res_path, exist_ok=True)
    noarch = 'noarch'
    for arch in src_src_map_rpm:
        if arch == noarch:
            continue

        # noarch packages are installable on every architecture, so each
        # concrete arch file also carries the noarch combinations.
        arch_pairs = []
        if noarch in src_src_map_rpm:
            arch_pairs += [(noarch, noarch), (noarch, arch)]
        arch_pairs += [(arch, noarch), (arch, arch)]

        res_binary_mapping_binary = {}
        for from_arch, to_arch in arch_pairs:
            if to_arch not in des_src_map_rpm:
                continue
            res_tmp = getBinaryMappingBinaryByArch(
                from_arch, to_arch, src_src_map_rpm, des_src_map_rpm, source_rpm_mapping_relation)
            merge_mapping_content(res_tmp, res_binary_mapping_binary)

        with open(os.path.join(res_path, src_to_binary_map_file + '_' + arch), 'w') as file:
            if res_binary_mapping_binary:
                # Format: <source binary> : <target binary 1> <target binary 2> ...
                for binary_name, des_names in res_binary_mapping_binary.items():
                    file.write(binary_name + ' : ' + ' '.join(des_names) + '\n')
            else:
                print(f'[{arch}] not match ...')


if __name__ == '__main__':
    main()
in des_src_map_rpm: + res_tmp = getBinaryMappingBinaryByArch(arch, arch, src_src_map_rpm, des_src_map_rpm, source_rpm_mapping_relation) + res_binary_mapping_binary = merge_mapping_content(res_tmp, res_binary_mapping_binary) + + if bool(res_binary_mapping_binary): + # 能够映射,输出到文件 + # 格式:源系统二进制包名1 : 目标系统二进制包名1 目标系统二进制包名2 ... + for binary_name in res_binary_mapping_binary: + file.write(binary_name + ' : ' + ' '.join(res_binary_mapping_binary[binary_name]) + '\n') + else : + print(f'[{arch}] not match ...') diff --git a/utils/rpm-package-mapping.py b/utils/rpm-package-mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..225dd7c2cff8dcd405d1c4141dbc6403dda929a5 --- /dev/null +++ b/utils/rpm-package-mapping.py @@ -0,0 +1,54 @@ +import difflib +import argparse +import Levenshtein # type: ignore + + +# 通过difflib.SequenceMatcher计算相似度 +def similarity(str1, str2): + matcher = difflib.SequenceMatcher(None, str1, str2) + return matcher.ratio() + +# 通过python-Levenshtein库的计算相似度 +def similarity_levenshtein(str1, str2): + edit_distance = distance(str1, str2) # type: ignore + max_len = max(len(str1), len(str2)) + return 1 - edit_distance/max_len + +def range_validator(min_value, max_value): + def validator(value): + fvalue = float(value) + if not min_value <= fvalue <= max_value: + raise argparse.ArgumentTypeError(f"值必须在 {min_value} 和 {max_value} 之间") + return fvalue + return validator + +if __name__ == '__main__': + parser = argparse.ArgumentParser(usage=""" + 包名映射 + """) + parser.add_argument('-s', type=str, required=True, + help="源包名") + parser.add_argument('-d', type=str, required=True, + help="待匹配的包名字符串,格式为\"package_name1 package_name2\",例如:\"389-ds-base 389-ds-base-devel\"") + parser.add_argument('-b', type=range_validator(0, 1), required=True, + help="最低相似度(浮点数值,范围:0~1),例如: 0.7") + + args = parser.parse_args() + print(args.__dict__) + src_str = str(args.s) + des_str = str(args.d) + similar_num_min = args.b + + # 拆分待匹配字符串 + des_strs = [str.strip() for 
str in des_str.split()] + + res_str_similar = {} + for str in des_strs: + similar_num = similarity(src_str, str) + if similar_num >= similar_num_min: + res_str_similar[str] = similar_num + + res_str_similar_sort = dict(sorted(res_str_similar.items(), key=lambda item: item[1], reverse=True)) + res_str_arr = [key for key in res_str_similar_sort] + print('result[' + " ".join(res_str_arr) + ']') + print(f'SIMILARITY-MATCH:[SUCCESS]-[{res_str_arr[0]}]')