本文采用基于网络嵌入学习方法 Node2Vec 的特征提取方法,对 DM、NM、WDNM 三类特征进行提取,并生成类级加权有向图,在图中标注边的权重、方向以及节点强度等信息。
步骤一:环境准备
1.软件系统数据获取
首先,需要确保已经正确下载并解压了所有指定版本的Java开源软件系统:
# Create the project directory and download the software systems.
# NOTE(review): the URLs below are taken as-is; confirm each one still resolves
# before relying on this script.
# NOTE(review): several archives are binary ("-bin") distributions, while the
# metrics step later walks a src/main/java directory — confirm the archives
# actually ship the Java sources you intend to analyse.
mkdir -p java_software_systems
cd java_software_systems
# Apache Ant 1.6.1
wget https://archive.apache.org/dist/ant/binaries/ant-1.6.1-bin.tar.gz
tar -xzf ant-1.6.1-bin.tar.gz
# Hibernate 5.2.12
wget https://sourceforge.net/projects/hibernate/files/hibernate5/5.2.12.Final/hibernate-release-5.2.12.Final.tar.gz
tar -xzf hibernate-release-5.2.12.Final.tar.gz
# jEdit 5.1.0 (-O fixes the local file name because the URL ends in /download)
wget https://sourceforge.net/projects/jedit/files/jEdit%205.1.0/jedit-5.1.0.tar.gz/download -O jedit-5.1.0.tar.gz
tar -xzf jedit-5.1.0.tar.gz
# JGAP 3.6.3
wget https://sourceforge.net/projects/jgap/files/jgap/3.6.3/jgap-3.6.3.zip
unzip jgap-3.6.3.zip
# JHotDraw 6.0b.1 (-O fixes the local file name because the URL ends in /download)
wget https://sourceforge.net/projects/jhotdraw/files/jHotDraw%206.0b.1/jhotdraw-6.0b.1.tar.gz/download -O jhotdraw-6.0b.1.tar.gz
tar -xzf jhotdraw-6.0b.1.tar.gz
# JMeter 2.0.1
wget https://archive.apache.org/dist/jmeter/binaries/jmeter-2.0.1.zip
unzip jmeter-2.0.1.zip
# Log4j 2.10.0
wget https://archive.apache.org/dist/logging/log4j/2.10.0/apache-log4j-2.10.0-bin.zip
unzip apache-log4j-2.10.0-bin.zip
# Wro4J 1.6.3 (-O fixes the local file name because the URL ends in /download)
wget https://sourceforge.net/projects/wro4j/files/wro4j-1.6.3/wro4j-1.6.3.tar.gz/download -O wro4j-1.6.3.tar.gz
tar -xzf wro4j-1.6.3.tar.gz
2.安装依赖库
# pandas and matplotlib are imported by the Python snippets below, so they are
# installed here together with the other dependencies.
pip install networkx numpy scikit-learn node2vec xmltodict pydot javalang pandas matplotlib
3.下载并安装 DependencyFinder
# Saves whatever SourceForge serves as the project's "latest" file under the
# name DependencyFinder.jar.
# NOTE(review): confirm the downloaded file really is a runnable jar and not an
# archive that has to be unpacked first.
wget https://sourceforge.net/projects/depfind/files/latest/download -O DependencyFinder.jar
步骤二:解析 Java 项目生成依赖网络
编写解析脚本
# parse_java_project.py
import os
import subprocess

import javalang
import networkx as nx
import pandas as pd
import xmltodict
def parse_dependency(xml_path):
    """Read a dependency XML report and return it as a list of weighted edges.

    The code reads ``<Dependency>`` elements below a ``<Dependencies>`` root
    and uses their ``src``, ``tgt``, ``type`` and optional ``count``
    attributes.
    NOTE(review): confirm this schema matches the XML your DependencyFinder
    version actually emits.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        A list of ``(src, tgt, attrs)`` tuples suitable for
        ``Graph.add_edges_from``, where ``attrs`` is
        ``{'type': <str>, 'weight': <int>}``. ``weight`` is 1 when the
        element carries no ``count`` attribute. The list is empty when the
        root has no ``<Dependency>`` children.

    Raises:
        KeyError: If the root element is not ``Dependencies`` or a dependency
            lacks one of the ``src``/``tgt``/``type`` attributes.
    """
    with open(xml_path, 'r') as f:
        xml = xmltodict.parse(f.read())

    # An empty root element is parsed as None, so fall back to an empty dict.
    root = xml['Dependencies'] or {}
    entries = root.get('Dependency') or []
    # xmltodict yields a single dict (not a one-element list) when the element
    # occurs exactly once; normalise so the loop always sees dependency dicts.
    if isinstance(entries, dict):
        entries = [entries]

    return [
        (
            dep['@src'],
            dep['@tgt'],
            {'type': dep['@type'], 'weight': int(dep.get('@count', 1))},
        )
        for dep in entries
    ]
def generate_graph(project_dir):
    """Run DependencyFinder on *project_dir* and build a directed graph from it.

    The tool's standard output is written to ``temp.xml`` in the current
    working directory and then parsed with ``parse_dependency``.
    NOTE(review): the command line (``java -jar DependencyFinder.jar <dir>``)
    is kept as given; confirm it is a valid invocation for your
    DependencyFinder distribution.

    Args:
        project_dir: Directory of the Java project to analyse.

    Returns:
        A ``networkx.DiGraph`` whose edges carry ``type`` and ``weight``
        attributes.

    Raises:
        subprocess.CalledProcessError: If the Java process exits with a
            non-zero status.
        FileNotFoundError: If the ``java`` executable cannot be found.
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and avoids command injection through project_dir.
    command = ["java", "-jar", "DependencyFinder.jar", project_dir]
    with open("temp.xml", "wb") as report:
        # check=True surfaces tool failures instead of parsing a broken report.
        subprocess.run(command, stdout=report, check=True)

    G = nx.DiGraph()
    G.add_edges_from(parse_dependency("temp.xml"))
    return G
def extract_class_metrics(java_source_dir):
    """Collect method and field counts for every top-level Java type.

    Walks *java_source_dir* recursively, parses each ``.java`` file with
    javalang and emits one row per top-level type declaration. Files that
    cannot be parsed are reported on stdout and skipped.

    Class names are qualified with the file's package (when it declares one)
    so they line up with the fully qualified node names used by the
    dependency graph and do not collide across packages.

    Args:
        java_source_dir: Root directory of the Java sources.

    Returns:
        A ``pandas.DataFrame`` with the columns ``class``, ``NumMethods`` and
        ``NumAttributes``. The columns are present even when no type was
        found, so a later ``set_index("class")`` still works.
    """
    columns = ["class", "NumMethods", "NumAttributes"]
    metrics = []
    for root, _, files in os.walk(java_source_dir):
        for file in files:
            if not file.endswith(".java"):
                continue
            try:
                # errors="replace" keeps one legacy-encoded source file from
                # aborting the whole scan.
                with open(os.path.join(root, file), "r",
                          encoding="utf-8", errors="replace") as f:
                    tree = javalang.parse.parse(f.read())
            except Exception as e:
                # Best-effort scan: report the file and carry on.
                print(f"Error parsing {file}: {e}")
                continue

            package = tree.package.name if tree.package else ""
            # A compilation unit may declare zero types (e.g.
            # package-info.java) or several; handle all of them.
            for type_decl in tree.types or []:
                qualified = f"{package}.{type_decl.name}" if package else type_decl.name
                metrics.append({
                    "class": qualified,
                    "NumMethods": len(type_decl.methods),
                    "NumAttributes": len(type_decl.fields),
                })
    return pd.DataFrame(metrics, columns=columns)
# Example usage: build the dependency graph and collect the per-class metrics.
# Replace the placeholder paths with the real location of the project.
G = generate_graph("/path/to/Ant-1.6.1")
dm_df = extract_class_metrics("/path/to/Ant-1.6.1/src/main/java")
步骤三:Node2Vec 嵌入学习
训练嵌入
from node2vec import Node2Vec
# Assumes G has already been produced by generate_graph.
# Set up the random-walk generator: 32-dimensional embeddings, walk length 30,
# num_walks=200, with walk-bias parameters p=0.25 and q=2.
node2vec = Node2Vec(G, dimensions=32, walk_length=30, num_walks=200, p=0.25, q=2)
# Fit the embedding model on the generated walks.
# NOTE(review): window/min_count are presumably forwarded to gensim's Word2Vec
# — confirm against the node2vec version in use.
model = node2vec.fit(window=10, min_count=1)
embeddings = model.wv  # node embedding vectors
步骤四:特征提取(DM、NM、WDNM)
DM
import pandas as pd
def calculate_dm(dm_df):
    """Return the DM table re-indexed by its ``class`` column.

    Args:
        dm_df: DataFrame that contains a ``class`` column.

    Returns:
        A new DataFrame whose index is the former ``class`` column; the
        input frame is not modified.
    """
    indexed = dm_df.set_index(keys="class")
    return indexed
# Re-index the class metrics by class name (calculate_dm moves the "class"
# column into the index).
dm_df = calculate_dm(dm_df)
NM
def calculate_nm(G):
    """Compute unweighted network metrics for every node of *G*.

    Args:
        G: The dependency graph.

    Returns:
        A ``pandas.DataFrame`` indexed by node with three positional columns:
        ``0`` = degree, ``1`` = betweenness centrality,
        ``2`` = eigenvector centrality. An empty frame is returned for a
        graph without nodes.
    """
    if len(G) == 0:
        # The centrality routines are not meaningful on a graph without
        # nodes; keep returning the empty frame the per-node loop produced.
        return pd.DataFrame({}).T

    # Both centralities are whole-graph computations: run each once instead
    # of once per node.
    betweenness = nx.betweenness_centrality(G)
    # A higher iteration cap only matters for graphs that do not converge
    # within the default budget; converging graphs give the same values.
    eigenvector = nx.eigenvector_centrality(G, max_iter=1000)

    nm = {
        node: [G.degree(node), betweenness[node], eigenvector[node]]
        for node in G.nodes
    }
    return pd.DataFrame(nm).T
# Unweighted network metrics (degree, betweenness, eigenvector) per node.
nm_df = calculate_nm(G)
WDNM
def calculate_wdnm(G):
    """Compute weighted, direction-aware network metrics for every node of *G*.

    Args:
        G: Directed dependency graph whose edges carry a ``weight`` attribute.

    Returns:
        A ``pandas.DataFrame`` indexed by node with two positional columns:
        ``0`` = sum of the weights of the node's outgoing edges,
        ``1`` = betweenness centrality computed with ``weight='weight'``.

    Raises:
        KeyError: If an outgoing edge has no ``weight`` attribute.
    """
    # Whole-graph computation: run it once instead of once per node.
    # NOTE(review): NetworkX uses the weight as a path length here, so a high
    # dependency count makes a path "longer" — confirm that is the intended
    # reading of the weights.
    weighted_betweenness = nx.betweenness_centrality(G, weight='weight')

    wdnm = {
        node: [
            sum(data['weight'] for _, _, data in G.out_edges(node, data=True)),
            weighted_betweenness[node],
        ]
        for node in G.nodes
    }
    return pd.DataFrame(wdnm).T
# Weighted metrics (outgoing weight sum, weighted betweenness) per node.
wdnm_df = calculate_wdnm(G)
合并特征
# nm_df and wdnm_df both use the positional column labels 0, 1, ...; give them
# descriptive names before merging so the combined table has no duplicate
# column labels. rename() leaves the original frames untouched.
features = pd.concat(
    [
        dm_df,
        nm_df.rename(columns={0: "degree", 1: "betweenness", 2: "eigenvector"}),
        wdnm_df.rename(columns={0: "weighted_degree", 1: "weighted_betweenness"}),
    ],
    axis=1,
)
步骤五:定义节点强度
补充节点强度
import numpy as np
def calculate_node_strength(model, G):
    """Map every node of *G* to the L2 norm of its embedding vector.

    Args:
        model: Trained embedding model; its ``wv`` attribute must support
            ``key in wv`` and ``wv[key]``.
        G: Graph whose ``nodes`` are iterated.

    Returns:
        A dict ``{node: strength}`` keyed by the original node objects.
        Nodes without an embedding get strength ``0``.
    """
    vectors = model.wv
    node_strength = {}
    for node in G.nodes:
        # node2vec stores node ids as strings in the trained model, so look
        # the node up by its string form; string ids behave as before.
        key = str(node)
        if key in vectors:
            # L2 norm of the Node2Vec embedding vector
            node_strength[node] = np.linalg.norm(vectors[key])
        else:
            node_strength[node] = 0  # node has no embedding
    return node_strength
def calculate_pagerank(G):
    """Return the PageRank score of every node, using the ``weight`` edge attribute.

    Args:
        G: Graph to rank.

    Returns:
        Whatever ``nx.pagerank`` returns for *G* (a mapping from node to
        score).
    """
    scores = nx.pagerank(G, weight='weight')
    return scores
# Compute the node strength and the PageRank score of every node in G.
node_strength = calculate_node_strength(model, G)
page_rank = calculate_pagerank(G)
步骤六:生成加权图示例
可视化代码
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
def plot_weighted_graph(G, node_strength, page_rank, nodes_to_highlight=None):
    """Draw *G* as a weighted directed graph and show it.

    Node size encodes node strength, node colour encodes PageRank (viridis
    colour map), arrows show the edge direction and every edge is labelled
    with its weight.

    Args:
        G: Directed graph whose edges carry a ``weight`` attribute.
        node_strength: Mapping from node to strength; must contain every
            node of *G*.
        page_rank: Mapping from node to PageRank; must contain every node
            of *G*.
        nodes_to_highlight: Optional iterable of nodes to mark with a red
            outline. Nodes that are not part of *G* are ignored. With the
            default ``None`` nothing is highlighted.

    Raises:
        KeyError: If a node is missing from *node_strength* or *page_rank*,
            or an edge has no ``weight`` attribute.
    """
    try:
        # GraphViz "dot" produces a layered layout that follows edge direction.
        pos = graphviz_layout(G, prog='dot')
    except ImportError:
        # graphviz_layout needs pygraphviz; fall back to a reproducible
        # built-in layout instead of failing when it is not installed.
        pos = nx.spring_layout(G, seed=42)

    edge_labels = {(u, v): f"{d['weight']}" for u, v, d in G.edges(data=True)}
    # Node size and colour are driven by strength and PageRank respectively.
    node_sizes = [node_strength[node] * 100 for node in G.nodes]
    node_colors = [page_rank[node] for node in G.nodes]
    nx.draw_networkx(
        G, pos,
        node_size=node_sizes,
        node_color=node_colors,
        cmap=plt.cm.viridis,
        with_labels=True,
        arrows=True,  # show edge direction
        arrowstyle='->',
        arrowsize=10
    )

    if nodes_to_highlight:
        wanted = set(nodes_to_highlight)
        highlighted = [node for node in G.nodes if node in wanted]
        if highlighted:
            # Overlay a transparent marker with a red outline so the
            # strength/PageRank encoding underneath stays visible.
            nx.draw_networkx_nodes(
                G, pos,
                nodelist=highlighted,
                node_size=[node_strength[node] * 100 for node in highlighted],
                node_color='none',
                edgecolors='red',
                linewidths=2,
            )

    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    # NOTE(review): the title contains CJK characters; the active matplotlib
    # font presumably needs to provide those glyphs — confirm on your setup.
    plt.title("加权有向图(方向、权重、节点强度)")
    plt.show()
# Example: draw the subgraph induced by two key classes of the Ant project.
subgraph = G.subgraph(['org.apache.tools.ant.Main', 'org.apache.tools.ant.Project'])
plot_weighted_graph(subgraph, node_strength, page_rank)
完整流程示例
# 1. Parse the project
# NOTE(review): parse_java_project.py as listed above uses a hard-coded path
# and neither reads this argument nor writes Ant-1.6.1.pkl — extend it before
# relying on the following steps.
python parse_java_project.py /path/to/Ant-1.6.1
# 2. Train Node2Vec
# A quoted heredoc passes the script verbatim, so quotes inside the Python
# code cannot clash with the shell's quoting.
python - <<'EOF'
import pickle
from node2vec import Node2Vec
# nx.read_gpickle was removed in NetworkX 3.0; the file is a plain pickle.
# Only load pickle files you created yourself: unpickling untrusted data can
# execute arbitrary code.
with open('Ant-1.6.1.pkl', 'rb') as fh:  # assumes the graph was saved earlier
    G = pickle.load(fh)
node2vec = Node2Vec(G, dimensions=32, walk_length=30, num_walks=200, p=0.25, q=2)
model = node2vec.fit(window=10, min_count=1)
model.wv.save_word2vec_format('ant_embeddings.txt')
EOF
# 3. Extract and merge the features
python - <<'EOF'
import pandas as pd
# NOTE(review): dm_df, nm_df and wdnm_df are not defined in this snippet;
# load or compute the three feature tables before merging them.
final_features = pd.concat([dm_df, nm_df, wdnm_df], axis=1)
final_features.to_csv('ant_features.csv')
EOF
# 4. Visualise
# NOTE(review): plot_weighted_graph.py and its --input/--output options are
# not shown in this guide — confirm the script exists with this interface.
python plot_weighted_graph.py --input Ant-1.6.1.pkl --output ant_graph.png
注意事项
- 路径替换:需将 `/path/to/Ant-1.6.1` 替换为实际项目路径。
- 标签定义:需根据实际需求定义关键类标签(例如通过规则或人工标注)。
- 性能优化:大规模项目需增加内存分配或分块处理。
- GraphViz 安装:确保已安装 GraphViz 工具(`sudo apt-get install graphviz`)。
Comments NOTHING