Examples
Some examples using the main class WDCClient.
import logging
import sys
import pandas as pd
import json
from dotenv import load_dotenv
import pytest
import http.client as http_client
from dsslab.wdc_client import *
from networkx.classes.function import get_node_attributes
# Configure logging once for this module: records go to stderr at INFO level.
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
# Module-level logger used by the example tests below.
logger = logging.getLogger(__name__)
# Initialize the environment with the variables WDC_HOST and WDC_TOKEN.
# NOTE(review): presumably these are read from a local .env file - confirm.
load_dotenv()
def test_searchDomainsAsDataFrame_with_encodings():
    """Run a domain search through loadAsDataFrame and check the result length."""
    # Preferred way to obtain a WDCClient: build it from the environment
    # variables WDC_HOST and WDC_TOKEN, which were loaded earlier via
    # load_dotenv. You are free to set up the environment by other means.
    client = WDCClient.fromEnv()  # <1>

    # NOTE: When copying this code, OMIT the "size" parameter of the
    # query string of the URL. It is set here only to exercise the
    # internal paging functionality.
    endpoint = 'api/snapshot/20121227_intermediaries/searchDomains'
    params = {'query': 'uni OR uni', 'size': 25}
    frame = client.loadAsDataFrame(endpoint, params)

    logger.info("dataFrame: %s", frame)
    assert len(frame) == 110
def test_searchDomainsAsJSON():
    """Run a domain search through loadAsJson with an explicitly configured client."""
    # Alternative to WDCClient.fromEnv(): pass host and token directly.
    client = WDCClient(host="http://localhost:8888/", token="mytoken")  # <2>

    # NOTE: When copying this code, OMIT the "size" parameter of the
    # query string of the URL. It is set here only to exercise the
    # internal paging functionality.
    search_params = {'query': 'uni', 'size': 20}
    entries = client.loadAsJson(
        'api/snapshot/20121227_intermediaries/searchDomains',
        search_params,
    )

    # Uncomment to inspect the loaded result:
    # logger.info("json: %s", entries)
    assert len(entries) == 110
def test_selectionSet():
    """
    Create a selection from a given set of domains and read it back.
    """
    client = WDCClient.fromEnv()
    expected_names = (
        "www.meine-domain.de",
        "www.eine-andere.de",
        "www.und-noch-eine-domain.de",
    )

    # Request body: one domain per line. The literal is kept flush-left so
    # that no indentation ends up inside the string that is sent.
    payload = """
www.meine-domain.de
www.eine-andere.de
www.und-noch-eine-domain.de
"""
    client.put("/api/selection/python-test-selection/set", payload)

    stored = client.loadAsJson("/api/selection/python-test-selection/domains")
    assert len(stored) == 3

    # Every expected domain must appear exactly once among the loaded entries.
    for name in expected_names:
        hits = sum(1 for entry in stored if entry["name"] == name)
        assert hits == 1
def test_loadForEach():
    """Feed a domain search through a callback and count the invocations."""
    client = WDCClient.fromEnv()

    # Define a callback function which is called on each object / row of
    # the result. It must be a function with the following signature:
    #   f(entry: Any, currentPos: int, maxPos: int)
    count = 0

    def f(e, pos, maxPos):
        nonlocal count
        # Pass the values as logging arguments instead of concatenating
        # strings: formatting is deferred to the logging framework and the
        # call matches the other logger calls in this file. The emitted
        # text is unchanged.
        logger.info("Current:%s of %s", pos, maxPos)
        count += 1

    # NOTE: "size" is presumably set only to exercise paging, as in the
    # other search examples - omit it when copying this code.
    client.loadForEach(
        'api/snapshot/20121227_intermediaries/searchDomains',
        {'query': 'uni OR uni', 'size': 25},
        f)

    # The callback must have run once per result entry.
    assert count == 110
def test_loadDomainGraph():
    """Load the domain graph of a snapshot and check its node and edge counts."""
    client = WDCClient.fromEnv()
    graph = client.loadDomainGraph("20121227_intermediaries")

    # Hand the graph to the logger as an argument rather than pre-building
    # the message with str(); the rendered text is the same and the call
    # matches the lazy-formatting style used elsewhere in this file.
    logger.info("graph: %s", graph)

    assert len(graph.nodes()) == 113
    assert len(graph.edges()) > 1000
def test_loadDomainGraphFromDataFrames():
    """
    Build a domain graph from node data that was enriched via a DataFrame.

    The nodes and edges are fetched with loadDomainGraphData, the nodes are
    converted into a DataFrame and extended by an extra column, and the
    graph is then created from that DataFrame.
    A real-world example might enrich the nodes from an Excel sheet or a
    database - anything that can be interpreted as a DataFrame.
    """
    snapshot = "20121227_intermediaries"
    client = WDCClient.fromEnv()
    raw_nodes, raw_edges = client.loadDomainGraphData(snapshot)

    # Enrichment step: add one additional column with a running number.
    node_frame = pd.json_normalize(raw_nodes)
    node_frame['new_col'] = range(len(raw_nodes))

    graph = client.createDomainGraph(node_frame, raw_edges)
    node_view = graph.nodes()
    assert len(node_view) == 113
    edge_view = graph.edges()
    assert len(edge_view) > 1000

    # The extra column must be available as a node attribute of the graph.
    enriched = get_node_attributes(graph, 'new_col')
    assert len(enriched) > 0