Examples

Some examples using the main class WDCClient.

import logging
import sys
import pandas as pd
import json
from dotenv import load_dotenv
import pytest
import http.client as http_client

from dsslab.wdc_client import *
  
from networkx.classes.function import get_node_attributes

# Send all log records of level INFO and above to stderr.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)
logger = logging.getLogger(__name__)

# Populate the environment (WDC_HOST and WDC_TOKEN are expected there)
# before any test creates a client from it.
load_dotenv()

def test_searchDomainsAsDataFrame_with_encodings():
    """
    Loads the result of a domain search via loadAsDataFrame and checks
    the length of the returned object.
    """
    # Preferred way to initialize a WDCClient: it uses the environment
    # variables WDC_HOST and WDC_TOKEN, which have been loaded via
    # load_dotenv beforehand. You are free to initialize the environment
    # by any other means you prefer.
    client = WDCClient.fromEnv() # <1>

    # NOTE: When copying this code you should OMIT the "size" parameter
    # of the query string of the URL. It is only set here to test the
    # internal paging functionality.
    endpoint = 'api/snapshot/20121227_intermediaries/searchDomains'
    params = {'query': 'uni OR uni', 'size': 25}
    df = client.loadAsDataFrame(endpoint, params)

    logger.info("dataFrame: %s", df)

    assert len(df) == 110
    
def test_searchDomainsAsJSON():
    """
    Loads the result of a domain search via loadAsJson and checks the
    number of returned entries.
    """
    # Alternative to WDCClient.fromEnv(): pass host and token explicitly.
    client = WDCClient(host="http://localhost:8888/",
        token="mytoken") # <2>

    # NOTE: When copying this code you should OMIT the "size" parameter
    # of the query string of the URL. It is only set here to test the
    # internal paging functionality.
    # The result is deliberately not called "json", which would shadow
    # the json module imported at the top of this file.
    entries = client.loadAsJson(
        'api/snapshot/20121227_intermediaries/searchDomains',
        {'query': 'uni', 'size': 20})

    assert len(entries) == 110
    
def test_selectionSet():
    """
    Creates a selection from a set of given domains and verifies that
    each of them is listed exactly once when the selection is read back.
    """
    client = WDCClient.fromEnv()

    # Request body handed to the "set" endpoint unchanged.
    payload = """
    www.meine-domain.de
    www.eine-andere.de
    www.und-noch-eine-domain.de
    """

    client.put("/api/selection/python-test-selection/set", payload)

    stored = client.loadAsJson(
        "/api/selection/python-test-selection/domains")

    assert len(stored) == 3

    # Every expected domain has to occur exactly once in the response.
    expected = (
        "www.meine-domain.de",
        "www.eine-andere.de",
        "www.und-noch-eine-domain.de",
    )
    for name in expected:
        hits = sum(1 for entry in stored if entry["name"] == name)
        assert hits == 1
        
def test_loadForEach():
    """
    Passes a callback to loadForEach for a domain search and checks how
    often the callback has been invoked.
    """
    client = WDCClient.fromEnv()

    # Callback function which is called on each object / row of the
    # result. It must be a function with the following signature:
    # f(entry: Any, currentPos: int, maxPos: int)
    count = 0
    def f(e, pos, maxPos):
        nonlocal count
        # Lazy %-style arguments: the message is only formatted when the
        # record is actually emitted.
        logger.info("Current:%s of %s", pos, maxPos)
        count += 1

    # NOTE: "size" is presumably only set to exercise the internal paging,
    # as in the other search examples of this file.
    client.loadForEach(
        'api/snapshot/20121227_intermediaries/searchDomains',
        {'query': 'uni OR uni', 'size': 25},
        f)

    assert count == 110
    
def test_loadDomainGraph():
    """
    Loads the domain graph of a snapshot and checks its number of
    nodes and edges.
    """
    client = WDCClient.fromEnv()

    graph = client.loadDomainGraph("20121227_intermediaries")

    # Lazy %-style argument, consistent with the other log calls.
    logger.info("graph: %s", graph)

    assert len(graph.nodes()) == 113
    assert len(graph.edges()) > 1000
    
def test_loadDomainGraphFromDataFrames():
    """
    Loads the nodes and edges of a domain graph, enriches the nodes
    through a DataFrame and builds the graph from the enriched data.

    A real-world example might enrich the nodes from an Excel sheet or
    a database - anything that can be interpreted as a DataFrame.
    """
    client = WDCClient.fromEnv()

    graph_data = client.loadDomainGraphData("20121227_intermediaries")
    nodes, edges = graph_data

    # Turn the nodes into a DataFrame and add an extra column holding a
    # running number per node.
    node_frame = pd.json_normalize(nodes)
    node_frame['new_col'] = range(len(nodes))

    graph = client.createDomainGraph(node_frame, edges)

    assert len(graph.nodes()) == 113
    assert len(graph.edges()) > 1000

    # The extra column has to be retrievable as a node attribute.
    new_col_by_node = get_node_attributes(graph, 'new_col')
    assert len(new_col_by_node) > 0