Semantic Search
Semantic Search
An example of Semantic Search
from __future__ import print_functionimport os.pathimport requestsimport pandas as pdpd.options.display.float_format = '{:,.3f}'.formatpd.options.display.expand_frame_repr = Falsedataset_name = "treclegal09_2k_subset" # see list of available datasetsBASE_URL = "http://localhost:5001/api/v0" # FreeDiscovery server URL
0. Load the test dataset
# 0. Download the example dataset description and build a custom dataset
#    definition for ingestion.
url = BASE_URL + '/example-dataset/{}'.format(dataset_name)
print(" GET", url)
input_ds = requests.get(url).json()

# BUG FIX: the response is a JSON object (dict); passing it to os.path.join
# raises a TypeError.  The on-disk location of the documents is stored in the
# response metadata (per the FreeDiscovery example-dataset API).
data_dir = input_ds['metadata']['data_dir']

# Each entry of input_ds['dataset'] is a mapping with 'document_id' and
# 'file_path' keys; build absolute file paths for ingestion.
dataset_definition = [{'document_id': row['document_id'],
                       'file_path': os.path.join(data_dir, row['file_path'])}
                      for row in input_ds['dataset']]
Out:
GET http://localhost:5001/api/v0/example-dataset/treclegal09_2k_subset
1. Feature extraction
1.a Load dataset and initialize feature extraction
# 1.a Create a new feature-extraction session on the server; the returned
#     id ('dsid') identifies it in all subsequent calls.
url = BASE_URL + '/feature-extraction'
print(" POST", url)
response = requests.post(url).json()
dsid = response['id']
print(" => received {}".format(list(response.keys())))
print(" => dsid = {}".format(dsid))
Out:
POST http://localhost:5001/api/v0/feature-extraction => received ['id'] => dsid = 0038734341cb4413
1.b Start feature extraction
# 1.b Trigger the actual feature extraction for the session created above,
#     handing the server our custom dataset definition.
url = '{}/feature-extraction/{}'.format(BASE_URL, dsid)
print(" POST", url)
requests.post(url, json={'dataset_definition': dataset_definition})
Out:
POST http://localhost:5001/api/v0/feature-extraction/0038734341cb4413
2. Calculate LSI
(used for Nearest Neighbors method)
# 2. Compute an LSI (truncated SVD) decomposition of the extracted features;
#    it is used later by the nearest-neighbours semantic search.
url = BASE_URL + '/lsi/'
print("POST", url)
n_components = 100
res = requests.post(url,
                    json={'n_components': n_components,
                          'parent_id': dsid}).json()
lsi_id = res['id']
print(' => LSI model id = {}'.format(lsi_id))
# BUG FIX: `res` is a JSON object, so `res*100` raises a TypeError.  The
# explained-variance ratio is returned under the 'explained_variance' key
# (per the FreeDiscovery LSI API).
print((" => SVD decomposition with {} dimensions explaining "
       "{:.2f} % variabilty of the data")
      .format(n_components, res['explained_variance'] * 100))
Out:
POST http://localhost:5001/api/v0/lsi/ => LSI model id = 56820e81350a4a89 => SVD decomposition with 100 dimensions explaining 69.79 % variabilty of the data
3. Semantic search
print("\n3.a. Perform the semantic search")query = ("There are some conflicts with the draft date, so we will probably " "need to have it on a different date.")url = BASE_URL + '/search/'print(" POST", url)res = requests.post(url, json={'parent_id': lsi_id, 'query': query }).json()data = resdf = pd.DataFrame(data).set_index('document_id')print(df)print(df.score.max())
Out:
3.a. Perform the semantic search POST http://localhost:5001/api/v0/search/ score document_id 5267025 0.585 5262436 0.583 5148361 0.516 115600 0.516 116964 0.515 5157441 0.515 116281 0.463 5152900 0.463 5067001 0.348 97969 0.348 73984 0.338 54756 0.338 4813636 0.338 4950625 0.338 75625 0.338 53361 0.333 4800481 0.333 2500 0.319 4251844 0.319 6724 0.309 6889 0.300 4363921 0.293 3055504 0.292 43264 0.288 3728761 0.287 1265625 0.287 1004004 0.283 3189796 0.275 3625216 0.273 1210000 0.273 ... ... 280900 -0.172 1600 -0.175 872356 -0.176 1263376 -0.181 1750329 -0.182 1838736 -0.183 1062961 -0.184 2268036 -0.186 249001 -0.187 605284 -0.188 3511876 -0.188 956484 -0.191 263169 -0.193 244036 -0.194 3671056 -0.198 3701776 -0.198 751689 -0.198 1656369 -0.198 3268864 -0.199 1766241 -0.208 3694084 -0.234 3125824 -0.237 3139984 -0.248 3118756 -0.250 3132900 -0.251 193600 -0.254 3122289 -0.255 3115225 -0.256 3136441 -0.258 3129361 -0.258 [2465 rows x 1 columns] 0.5849642792577727
Delete the extracted features
# Clean up: delete the extracted features (and the session) on the server.
requests.delete('{}/feature-extraction/{}'.format(BASE_URL, dsid))
Total running time of the script: ( 0 minutes 3.373 seconds)