Documenti di Didattica
Documenti di Professioni
Documenti di Cultura
config
from elasticsearch import Elasticsearch
from neo4j import GraphDatabase
import re
import json
from uuid import uuid4
from elasticsearch import helpers
def get_es_client(hosts="10.1.129.78", port=9200, user="yury",
                  password="bnfjby_130688", http_scheme="https"):
    """Create an Elasticsearch client for the configured endpoint.

    All parameters default to the values that used to be hard-coded in the
    function body, so calling ``get_es_client()`` with no arguments behaves
    exactly as before.

    Args:
        hosts: Host (or hosts) handed to the ``Elasticsearch`` constructor.
        port: Port of the Elasticsearch HTTP endpoint.
        user: User name for HTTP basic authentication.
        password: Password for HTTP basic authentication.
        http_scheme: ``"https"`` (compared case-insensitively) enables SSL;
            any other value builds a client without the SSL options.

    Returns:
        The constructed ``Elasticsearch`` client.
    """
    # NOTE(review): the default credentials are kept only for backward
    # compatibility; they should be supplied by the caller or by deployment
    # configuration rather than living in source control.
    common_kwargs = dict(http_auth=(user, password), scheme=http_scheme,
                         port=port)

    if http_scheme.lower() == 'https':
        # Certificate verification is explicitly switched off here
        # (verify_certs=False), matching the original behaviour.
        return Elasticsearch(hosts, ca_certs=False, verify_certs=False,
                             use_ssl=True, **common_kwargs)

    return Elasticsearch(hosts, **common_kwargs)
def get_neo4j_client(host="bolt://10.1.129.78:7687", user="neo4j",
                     password="neo"):
    """Create a Neo4j driver for the configured Bolt endpoint.

    The parameters default to the values that used to be hard-coded in the
    function body, so ``get_neo4j_client()`` with no arguments behaves
    exactly as before.

    Args:
        host: Bolt URI passed to ``GraphDatabase.driver``.
        user: User name for authentication.
        password: Password for authentication.

    Returns:
        The driver object returned by ``GraphDatabase.driver``.
    """
    # NOTE(review): the default credentials are kept only for backward
    # compatibility; prefer passing them in from deployment configuration.
    return GraphDatabase.driver(host, auth=(user, password))
# Index, document type and scroll keep-alive used by the search below.
es_index = "email-dnc-entities"
es_doc_type = "DNC"
scroll_timeout = '10m'

# Field names inside the indexed documents. Only `text_field` is used in the
# query below; the other three are not referenced in this section.
text_field = 'email.body'
emails_field = 'entities.identifier:email'
people_field = 'entities.person'
organizations_field = 'entities.organization'

# Select only the documents in which `text_field` exists.
query_body = {
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": text_field}},
            ],
        },
    },
}

es = get_es_client()
neo = get_neo4j_client()

# First page of results; the response also carries the scroll id.
page = es.search(
    body=query_body,
    index=es_index,
    doc_type=es_doc_type,
    size=1000,
    scroll=scroll_timeout,
)
sid = page["_scroll_id"]
scroll_size = len(page["hits"]["hits"])
# Start scrolling: handle the hits of the current page for as long as the
# page is non-empty, counting the processed hits in `total_docs`.
# NOTE(review): no request for the next page (and no update of `page`, `sid`
# or `scroll_size`) is visible in this section -- confirm that it follows
# further down, otherwise this loop would never terminate.
total_docs = 0
while scroll_size > 0:
total_docs += scroll_size
for i in range(0, scroll_size):
doc = page['hits']['hits'][i]['_source']
# Hand the Neo4j client, the whole source document and the first element of
# doc["email"]["body"] (presumably the message text -- TODO confirm) to the
# helper, which is defined outside this section.
create_doc_entities_from_to(neo, doc, doc["email"]["body"][0])
# The commented-out lines below are a disabled face-search lookup.
# if ('processed' in doc) and ('faces_search' in doc['processed']) \
# and ('embedding_id' in doc['processed']['faces_search'][0]):
# face_id_list = doc['processed']['faces_search'][0][vector_key]
# if len(face_id_list) > 0:
# for face_id in face_id_list:
# res = self.tree_service.get_by_id(clustering_container_name,
# NOTE(review): the next line looks like the wrapped tail of the
# commented-out call above, but it is not itself commented out -- confirm.
face_id)
# if res is not None:
# face_ids.append(res[0])
# ids.append(page['hits']['hits'][i]['_id'])
# Debug output of the source document.
print(doc)