Data Ingestion

An example illustrating data ingestion in FreeDiscovery.

from __future__ import print_function

import json
import os.path

import pandas as pd
import requests

pd.options.display.float_format = '{:,.3f}'.format
pd.options.display.expand_frame_repr = False

dataset_name = "treclegal09_2k_subset"     # see list of available datasets
BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL
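
Before running the steps below, it can be useful to check that the FreeDiscovery server is actually reachable. The following sketch reuses the example-dataset endpoint from step 0 for that purpose; the timeout value and error handling are illustrative additions, not part of the original script.

# Optional sanity check: any 2xx response means the server is up.
# (Illustrative sketch; timeout and error handling are assumptions.)
try:
    ping = requests.get(BASE_URL + '/example-dataset/{}'.format(dataset_name),
                        timeout=10)
    ping.raise_for_status()
except requests.exceptions.RequestException as e:
    raise SystemExit("FreeDiscovery server not reachable at {}: {}"
                     .format(BASE_URL, e))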

0. Load the test dataset

url = BASE_URL + '/example-dataset/{}'.format(dataset_name)
print(" GET", url)
input_ds = requests.get(url).json()

# To use a custom dataset instead, build the same dataset definition
# by hand (one entry per document):
data_dir = input_ds['metadata']['data_dir']
dataset_definition = [{'document_id': row['document_id'],
                       'file_path': os.path.join(data_dir, row['file_path'])}
                      for row in input_ds['dataset']]

Out:

GET http://localhost:5001/api/v0/example-dataset/treclegal09_2k_subset
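
Here the dataset definition is built from the bundled example dataset. For a custom corpus the same structure can be assembled by hand; a minimal sketch, where the file paths and document_id values below are hypothetical:

# Hypothetical custom dataset definition: each entry pairs a
# user-chosen document_id with a file path accessible to the server.
custom_definition = [
    {'document_id': 10, 'file_path': '/path/to/corpus/doc_a.txt'},
    {'document_id': 11, 'file_path': '/path/to/corpus/doc_b.txt'},
]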

1.a Load the dataset and initialize feature extraction

url = BASE_URL + '/feature-extraction'
print(" POST", url)
res = requests.post(url, json={'use_hashing': True}).json()

dsid = res['id']
print(" => received {}".format(list(res.keys())))
print(" => dsid = {}".format(dsid))

print("\n1.b Start feature extraction")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(" POST", url)
res = requests.post(url, json={'dataset_definition': dataset_definition})

Out:

POST http://localhost:5001/api/v0/feature-extraction
   => received ['id']
   => dsid = 0984d81158c844c5

1.b Start feature extraction
 POST http://localhost:5001/api/v0/feature-extraction/0984d81158c844c5
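
Depending on the server configuration, feature extraction may run asynchronously. If so, the same endpoint can be polled until all documents are processed; a sketch, assuming the n_samples and n_samples_processed fields reported by the GET request in step 2 below:

import time

# Sketch: poll until every document has been processed. If the POST
# above already blocks until completion, this loop exits immediately.
status_url = BASE_URL + '/feature-extraction/{}'.format(dsid)
while True:
    status = requests.get(status_url).json()
    if status.get('n_samples_processed') == status.get('n_samples'):
        break
    time.sleep(1)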

2. Check the parameters of the extracted features

url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(' GET', url)
res = requests.get(url).json()

print('\n'.join([' - {}: {}'.format(key, val)
                 for key, val in res.items()
                 if "filenames" not in key]))

Out:

GET http://localhost:5001/api/v0/feature-extraction/0984d81158c844c5
     - analyzer: word
     - chunk_size: 5000
     - data_dir: /home/ubuntu/freediscovery_shared/treclegal09_2k_subset/data/jobRun_4/XML_EXPORT_CONTENT/text_9
     - max_df: 1.0
     - min_df: 0.0
     - n_features: 100001
     - n_jobs: 1
     - n_samples: 2465
     - n_samples_processed: 2465
     - ngram_range: [1, 1]
     - norm_alpha: 0.75
     - parse_email_headers: False
     - preprocess: []
     - stop_words: english
     - use_hashing: True
     - weighting: nnc
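
Since pandas is already imported and configured at the top of the script, the same response can also be rendered as a small table; a minimal sketch:

# Sketch: display the extraction parameters as a one-column table,
# reusing the `res` dict returned by the GET request above.
params = {key: val for key, val in res.items() if 'filenames' not in key}
print(pd.Series(params).to_string())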

3. Examine the id mapping

url = BASE_URL + "/feature-extraction/{}/id-mapping".format(dsid)
print('\n POST', url)

data = {'data': [{'internal_id': row['internal_id']}
                 for row in input_ds['dataset'][:3]]}
print(' DATA', json.dumps(data))

res = requests.post(url, json=data).json()
print(' Response:')
print(' ', json.dumps(res, indent=4))

Out:

POST http://localhost:5001/api/v0/feature-extraction/0984d81158c844c5/id-mapping
   DATA {"data": [{"internal_id": 0}, {"internal_id": 1}, {"internal_id": 2}]}
 Response:
   {
    "data": [
        {
            "document_id": 0,
            "file_path": "0.7.47.1097257.txt",
            "internal_id": 0
        },
        {
            "document_id": 1,
            "file_path": "0.7.47.1097258.txt",
            "internal_id": 1
        },
        {
            "document_id": 4,
            "file_path": "0.7.47.1097259.txt",
            "internal_id": 2
        }
    ]
}
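
The response can also be turned into a plain Python dict for fast lookups, for instance mapping internal_id to document_id (the values in the comment match the response above):

# Sketch: build a lookup table from the id-mapping response.
id_map = {row['internal_id']: row['document_id'] for row in res['data']}
print(id_map)   # -> {0: 0, 1: 1, 2: 4}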

4. Delete the extracted features

url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(" DELETE", url)
requests.delete(url)

Out:

DELETE http://localhost:5001/api/v0/feature-extraction/0984d81158c844c5
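
To confirm the deletion took effect, one can issue a GET on the same URL; the exact status code returned for a missing dataset is server-dependent (a 4xx error is an assumption here):

# Sketch: after deletion, a GET on the same dsid should fail;
# the specific error code is an assumption about the server.
check = requests.get(url)
print(" GET", url, "->", check.status_code)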

Total running time of the script: ( 0 minutes 1.862 seconds)