使用子图和图采样投影进行节点回归

此 Jupyter Notebook 托管在这里的 Neo4j 图数据科学客户端 GitHub 仓库中。

若想观看该 Notebook 早期版本的视频演示，请参见在 NODES 2022 大会上进行的演讲 Neo4j 图数据科学基础系列 2.x – 流水线及更多。

本 Notebook 示例展示了如何使用节点回归流水线，同时包含了大量以下内容的示例：

便利对象
过滤图
图样本投影

它使用纯 Python 编写，以展示 GDS Python 客户端抽象 Cypher 查询的能力。

1. 数据集

我们的输入图表示的是维基百科上关于特定主题的页面以及它们之间的相互链接

变色龙
松鼠
鳄鱼

特征是页面文本中出现的某些信息性名词。目标是该页面的月均访问量。

该数据集首次发表在《Multi‑scale Attributed Node Embedding》一文中，作者 B. Rozemberczki、C. Allen 与 R. Sarkar，eprint 1909.13021。此处托管的版本取自 SNAP，日期为 2022‑11‑14。

2. 前置条件

要运行此流水线，必须拥有一个正在运行的 Neo4j DBMS，并安装了最新版本的 Neo4j 图数据科学插件。如果您已有可用的 AuraDS 实例，则已满足这些要求。

# First, we must install the GDS Python Client
%pip install graphdatascience

import os

# Then, we connect to our Neo4j DBMS hosting the Graph Data Science library
from graphdatascience import GraphDataScience

# Get Neo4j DB URI, credentials and name from environment if applicable
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://:7687")
NEO4J_AUTH = None
NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
if os.environ.get("NEO4J_USER") and os.environ.get("NEO4J_PASSWORD"):
    NEO4J_AUTH = (
        os.environ.get("NEO4J_USER"),
        os.environ.get("NEO4J_PASSWORD"),
    )
gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)

# Test our connection and print the Graph Data Science library version
print(gds.server_version())

from graphdatascience import ServerVersion

assert gds.server_version() >= ServerVersion(2, 5, 0)

# Importing the dataset

# The dataset is sourced from this GitHub repository
baseUrl = (
    "https://raw.githubusercontent.com/neo4j/graph-data-science-client/main/examples/datasets/wikipedia-animals-pages"
)

# Constraints to speed up importing
gds.run_cypher(
    """
    CREATE CONSTRAINT chameleons
    FOR (c:Chameleon)
    REQUIRE c.id IS NODE KEY
"""
)
gds.run_cypher(
    """
    CREATE CONSTRAINT crocodiles
    FOR (c:Crocodile)
    REQUIRE c.id IS NODE KEY
"""
)
gds.run_cypher(
    """
    CREATE CONSTRAINT squirrels
    FOR (s:Squirrel)
    REQUIRE s.id IS NODE KEY
"""
)

# Create nodes and relationships
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_edges.csv' AS row
    MERGE (c1:Chameleon {id: row.id1})
    MERGE (c2:Chameleon {id: row.id2})
    MERGE (c1)-[:LINK]->(c2)
""",
    {"baseUrl": baseUrl},
)
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_edges.csv' AS row
    MERGE (c1:Crocodile {id: row.id1})
    MERGE (c2:Crocodile {id: row.id2})
    MERGE (c1)-[:LINK]->(c2)
""",
    {"baseUrl": baseUrl},
)
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_edges.csv' AS row
    MERGE (s1:Squirrel {id: row.id1})
    MERGE (s2:Squirrel {id: row.id2})
    MERGE (s1)-[:LINK]->(s2)
""",
    {"baseUrl": baseUrl},
)

# Create target properties
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_target.csv' AS row
    MATCH (c:Chameleon {id: row.id})
    SET c.target = toInteger(row.target)
""",
    {"baseUrl": baseUrl},
)
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_target.csv' AS row
    MATCH (c:Crocodile {id: row.id})
    SET c.target = toInteger(row.target)
""",
    {"baseUrl": baseUrl},
)
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_target.csv' AS row
    MATCH (s:Squirrel {id: row.id})
    SET s.target = toInteger(row.target)
""",
    {"baseUrl": baseUrl},
)

# Create feature vectors
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_features.csv' AS row
    MATCH (c:Chameleon {id: row.id})
    WITH c, split(row.features, '|') AS features
    SET c.features = features
""",
    {"baseUrl": baseUrl},
)
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_features.csv' AS row
    MATCH (c:Crocodile {id: row.id})
    WITH c, split(row.features, '|') AS features
    SET c.features = features
""",
    {"baseUrl": baseUrl},
)
gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_features.csv' AS row
    MATCH (c:Squirrel {id: row.id})
    WITH c, split(row.features, '|') AS features
    SET c.features = features
""",
    {"baseUrl": baseUrl},
)

3. 为流水线准备数据集

为了使用该数据集，我们必须将特征转换为模型支持且易于处理的格式。原始特征是特定词语的 ID，因而不适合作为线性回归的输入。

为了解决此问题，我们将采用独热编码。这会生成适合线性回归的特征。首先我们学习所有节点集合中名词的词典，然后创建一个用于存放词典的节点，随后利用它对所有特征向量进行独热编码。

# Construct one-hot dictionaries
gds.run_cypher(
    """
    MATCH (s:Chameleon)
    WITH s.features AS features
    UNWIND features AS feature
    WITH feature
      ORDER BY feature ASC
    WITH collect(distinct feature) AS orderedTotality
    CREATE (:Feature {animal: 'chameleon', totality: orderedTotality})
    RETURN orderedTotality
"""
)
gds.run_cypher(
    """
    MATCH (s:Crocodile)
    WITH s.features AS features
    UNWIND features AS feature
    WITH feature
      ORDER BY feature ASC
    WITH collect(distinct feature) AS orderedTotality
    CREATE (:Feature {animal: 'crocodile', totality: orderedTotality})
    RETURN orderedTotality
"""
)
gds.run_cypher(
    """
    MATCH (s:Squirrel)
    WITH s.features AS features
    UNWIND features AS feature
    WITH feature
      ORDER BY feature ASC
    WITH collect(distinct feature) AS orderedTotality
    CREATE (:Feature {animal: 'squirrel', totality: orderedTotality})
    RETURN orderedTotality
"""
)

# Do one-hot encoding
gds.run_cypher(
    """
    MATCH (f:Feature {animal: 'chameleon'})
    MATCH (c:Chameleon)
    SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
gds.run_cypher(
    """
    MATCH (f:Feature {animal: 'crocodile'})
    MATCH (c:Crocodile)
    SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
gds.run_cypher(
    """
    MATCH (f:Feature {animal: 'squirrel'})
    MATCH (c:Squirrel)
    SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)

# First, let's project our graph into the GDS Graph Catalog
# We will use a native projection to begin with
G_animals, projection_result = gds.graph.project(
    "wiki_animals",
    ["Chameleon", "Squirrel", "Crocodile"],
    {"LINK": {"orientation": "UNDIRECTED"}},
    nodeProperties=["features_one_hot", "target"],
)
print(projection_result[["graphName", "nodeCount", "relationshipCount"]])

4. 连通性

在图分析中，通常只对连通图进行操作。即图仅由单个 连通分量 组成。这样做的原因是，大多数情况下信息只能在有连边的地方流动。

确定图中分量数量的最快方法是使用 WCC（弱连通分量）算法。

# We use the WCC algorithm to see how many components we have
wcc_result = gds.wcc.mutate(G_animals, mutateProperty="wcc_component")

print(wcc_result[["computeMillis", "componentCount"]])

5. 分量划分

得知我们的图包含三个分量后，接下来会将这些分量分离为不同的子图。我们将使用 subgraph 投影来实现此目的，为每个分量创建一个子图。

# First, we stream the component ids
components = gds.graph.nodeProperty.stream(G_animals, "wcc_component")

# Second, we compute the unique component ids
component_ids = components["propertyValue"].unique()

# Third, we project a subgraph for each component
component_graphs = [
    gds.graph.filter(
        f"animals_component_{component_id}",
        G_animals,
        f"n.wcc_component = {component_id}",
        "*",
    )[0]
    for component_id in component_ids
]

# Lastly, we map the node labels in the graphs to the graph
graph_components_by_labels = {str(G_component.node_labels()): G_component for G_component in component_graphs}

print({k: v.name() for k, v in graph_components_by_labels.items()})

# Now, we are only interested in the Chameleon graph,
# so we will drop the other graphs and define a better variable for the one we keep
graph_components_by_labels[str(["Crocodile"])].drop()
graph_components_by_labels[str(["Squirrel"])].drop()
G_chameleon = graph_components_by_labels[str(["Chameleon"])]

# With the graph object G_chameleon, we can inspect some statistics
print("#nodes: " + str(G_chameleon.node_count()))
print("#relationships: " + str(G_chameleon.relationship_count()))
print("Degree distribution")
print("=" * 25)
print(G_chameleon.degree_distribution().sort_index())

6. 现在，让我们构建训练流水线！

我们将创建一个节点回归流水线，然后

配置划分方式
添加模型候选项
配置自动调参
添加节点属性步骤
选择模型特征

该流水线存放在流水线目录（Pipeline Catalog）中，我们通过 Pipeline 对象进行操作，以获得最大的便利性。

# Now, let's construct a training pipeline!
chameleons_nr_training = gds.nr_pipe("node_regression_pipeline__Chameleons")

# We configure the splitting
chameleons_nr_training.configureSplit(validationFolds=5, testFraction=0.2)

# We add a set of model candidates
# A linear regression model with the learningRate parameter in a search space
chameleons_nr_training.addLinearRegression(
    penalty=1e-5,
    patience=3,
    tolerance=1e-5,
    minEpochs=20,
    maxEpochs=500,
    learningRate={"range": [100, 1000]},  # We let the auto-tuner find a good value
)
# Let's try a few different models
chameleons_nr_training.configureAutoTuning(maxTrials=10)

# Our input feature dimension is 3132
# We can reduce the dimension to speed up training using a FastRP node embedding
chameleons_nr_training.addNodeProperty(
    "fastRP",
    embeddingDimension=256,
    propertyRatio=0.8,
    featureProperties=["features_one_hot"],
    mutateProperty="frp_embedding",
    randomSeed=420,
)

# And finally we select what features the model should be using
# We rely on the FastRP embedding solely, because it encapsulates the one-hot encoded source features
chameleons_nr_training.selectFeatures("frp_embedding")

# The training pipeline is now fully configured and ready to be run!

# We use the training pipeline to train a model
nc_model, train_result = chameleons_nr_training.train(
    G_chameleon,  # First, we use the entire Chameleon graph
    modelName="chameleon_nr_model",
    targetNodeLabels=["Chameleon"],
    targetProperty="target",
    metrics=["MEAN_SQUARED_ERROR", "MEAN_ABSOLUTE_ERROR"],
    randomSeed=420,
)

print("Winning model parameters: \n\t\t" + str(train_result["modelInfo"]["bestParameters"]))
print()
print("MEAN_SQUARED_ERROR      test score: " + str(train_result["modelInfo"]["metrics"]["MEAN_SQUARED_ERROR"]["test"]))
print("MEAN_ABSOLUTE_ERROR     test score: " + str(train_result["modelInfo"]["metrics"]["MEAN_ABSOLUTE_ERROR"]["test"]))

# Let's sample the graph to see if we can get a similarly good model
G_chameleon_sample, _ = gds.alpha.graph.sample.rwr(
    "cham_sample",
    G_chameleon,
    samplingRatio=0.30,  # We'll use 30% of the graph
)

# Now we can use the same training pipeline to train another model, but faster!
nc_model_sample, train_result_sample = chameleons_nr_training.train(
    G_chameleon_sample,
    modelName="chameleon_nr_model_sample",
    targetNodeLabels=["Chameleon"],
    targetProperty="target",
    metrics=["MEAN_SQUARED_ERROR", "MEAN_ABSOLUTE_ERROR"],
    randomSeed=420,
)

print("Winning model parameters: \n\t\t" + str(train_result_sample["modelInfo"]["bestParameters"]))
print()
print(
    "MEAN_SQUARED_ERROR      test score: "
    + str(train_result_sample["modelInfo"]["metrics"]["MEAN_SQUARED_ERROR"]["test"])
)
print(
    "MEAN_ABSOLUTE_ERROR     test score: "
    + str(train_result_sample["modelInfo"]["metrics"]["MEAN_ABSOLUTE_ERROR"]["test"])
)

# Let's see what our models predict

# The speed-trained model on 24% training data (30% sample - 20% test set)
predicted_targets_sample = nc_model_sample.predict_stream(G_chameleon)
# The fully trained model on 80% training data (20% test set)
predicted_targets_full = nc_model.predict_stream(G_chameleon)

# The original training data for comparison
real_targets = gds.graph.nodeProperty.stream(G_chameleon, "target")

# Merging the data frames
merged_full = real_targets.merge(predicted_targets_full, left_on="nodeId", right_on="nodeId")
merged_all = merged_full.merge(predicted_targets_sample, left_on="nodeId", right_on="nodeId")

# Look at the last 10 rows
print(merged_all.tail(10))