# [document-viewer residue, not part of the program] You are on page 1 of 3

import json
import logging.config
import re
from uuid import uuid4

from elasticsearch import Elasticsearch
from elasticsearch import helpers
from neo4j import GraphDatabase

def get_es_client(hosts="10.1.129.78", port=9200, user="yury",
                  password="bnfjby_130688", http_scheme="https"):
    """Build an Elasticsearch client using HTTP basic authentication.

    Called with no arguments, this connects with the same settings the
    script has always used; every setting can be overridden per call.

    Args:
        hosts: Host (or hosts) passed straight to ``Elasticsearch``.
        port: TCP port of the Elasticsearch HTTP endpoint.
        user: User name for HTTP basic auth.
        password: Password for HTTP basic auth.
        http_scheme: ``"https"`` (case-insensitive) turns on the SSL
            options; any other value connects without them.

    Returns:
        The configured ``Elasticsearch`` client.
    """
    # NOTE(review): the default credentials are hard-coded in source; they
    # should come from configuration or the environment instead.
    options = {
        "http_auth": (user, password),
        "scheme": http_scheme,
        "port": port,
    }
    if http_scheme.lower() == 'https':
        # NOTE(review): certificate verification is switched off here, so
        # the TLS connection is encrypted but the server is not verified.
        options.update(ca_certs=False, verify_certs=False, use_ssl=True)
    return Elasticsearch(hosts, **options)

def get_neo4j_client(host="bolt://10.1.129.78:7687", user="neo4j",
                     password="neo"):
    """Build a Neo4j driver for the given Bolt endpoint.

    Called with no arguments, this connects with the same settings the
    script has always used; every setting can be overridden per call.

    Args:
        host: Bolt URI of the Neo4j server.
        user: Neo4j user name.
        password: Neo4j password.

    Returns:
        The driver object returned by ``GraphDatabase.driver``.
    """
    # NOTE(review): the default credentials are hard-coded in source; they
    # should come from configuration or the environment instead.
    return GraphDatabase.driver(host, auth=(user, password))

def _collect_header_people(text, email_regex, name_regex, flags):
    """Map each name found by the two header regexes to an e-mail or None."""
    people = {}
    # email_regex yields three groups per match: the name fragment, the
    # whole "mailto:..." tail, and the bare address.
    for groups in re.findall(email_regex, text, flags):
        people[groups[0]] = groups[2]
    # name_regex yields just the name; an address captured above is kept.
    for person in re.findall(name_regex, text, flags):
        people.setdefault(person, None)
    return people


def create_doc_entities_from_to(neo, doc, text):
    """Extract the ``From:`` / ``To:`` people from an e-mail body.

    The names (and ``mailto:`` addresses, where present) are collected into
    two dictionaries keyed by name. Nothing is written to Neo4j yet: a
    session is opened but the write calls are still to be implemented, so
    the collected data is currently discarded.

    Args:
        neo: Object exposing ``session()`` that returns a context manager
            (the Neo4j driver in this script).
        doc: Source document of the e-mail; not used yet.
        text: E-mail body to scan, one header per line.

    Returns:
        None.
    """
    from_person_email_regex = r"From: \"?([a-zA-Z]+[',. -\_][a-zA-Z ]?[a-zA-Z]*)*.*(mailto:([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+).*)"
    from_person_regex = r"^From: \"?([a-zA-Z ]+[',. -\_]?[a-zA-Z ]?[a-zA-Z.]*).*$"
    to_person_email_regex = r"To: \"?([a-zA-Z]+[',. -\_][a-zA-Z ]?[a-zA-Z]*)*.*(mailto:([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+).*)"
    to_person_regex = r"^To: \"?([a-zA-Z ]+[',. -\_]?[a-zA-Z ]?[a-zA-Z.]*).*$"

    # NOTE(review): "From:" is matched case-insensitively while "To:" is
    # not. That difference is kept as it was -- confirm it is intended.
    email_people_found_from = _collect_header_people(
        text, from_person_email_regex, from_person_regex, re.M | re.I)
    email_people_found_to = _collect_header_people(
        text, to_person_email_regex, to_person_regex, re.M)

    with neo.session() as session:
        # TODO: create the document node (_create_doc_node) and the
        # sender/recipient nodes (_create_from_to_nodes) through
        # session.write_transaction once their inputs are wired up.
        pass

def _create_doc_node(tx, doc):
    """Create an ``Email`` node for *doc* and return the id the query yields.

    Args:
        tx: Object exposing ``run(query, **params)`` (a Neo4j transaction
            when invoked through ``session.write_transaction``).
        doc: Mapping with the keys ``"logstash_checksum"`` and
            ``"filename"``.

    Returns:
        The first field of the single record returned by the query, i.e.
        ``id(a)`` of the created node.
    """
    checksum_key = "logstash_checksum"
    filename_key = "filename"
    cypher = (
        "CREATE (a:Email) "
        "SET a.logstash_checksum = $logstash_checksum,"
        "a.name = $name "
        "RETURN id(a)"
    )
    outcome = tx.run(
        cypher,
        logstash_checksum=doc[checksum_key],
        name=doc[filename_key],
    )
    record = outcome.single()
    return record[0]

def _create_from_to_nodes(tx, from_list, to_list):
    """Create one ``From`` node per sender and one ``To`` node per recipient.

    Args:
        tx: Object exposing ``run(query, **params)`` (a Neo4j transaction
            when invoked through ``session.write_transaction``).
        from_list: Iterable of mappings with ``"email"`` and ``"name"`` keys.
        to_list: Iterable of mappings with ``"email"`` and ``"name"`` keys.

    Returns:
        None.
    """
    email_key = "email"
    name_key = "name"
    # Senders are written first, then recipients, one statement per person.
    statements = (
        ("CREATE (a:From) SET a.email = $email, a.name=$name RETURN id(a)",
         from_list),
        ("CREATE (a:To) SET a.email = $email, a.name=$name RETURN id(a)",
         to_list),
    )
    for cypher, people in statements:
        for person in people:
            tx.run(cypher, email=person[email_key], name=person[name_key])

# Script settings: which index/type to read and how long each scroll
# context is kept alive between requests.
es_index = "email-dnc-entities"
es_doc_type = "DNC"
scroll_timeout = '10m'
text_field = 'email.body'
# The three field names below are not referenced in the visible script.
emails_field = 'entities.identifier:email'
people_field = 'entities.person'
organizations_field = 'entities.organization'
# Only documents that actually have an e-mail body are selected.
query_body = {"query": {"bool": {"must": [{"exists": {"field": text_field}}]}}}

es = get_es_client()
neo = get_neo4j_client()
page = es.search(
    index=es_index,
    doc_type=es_doc_type,
    scroll=scroll_timeout,
    size=1000,
    body=query_body)

sid = page['_scroll_id']
scroll_size = len(page['hits']['hits'])

# Start scrolling
total_docs = 0
try:
    while scroll_size > 0:
        total_docs += scroll_size
        for hit in page['hits']['hits']:
            doc = hit['_source']
            # The body is stored as a list; its first element is scanned.
            create_doc_entities_from_to(neo, doc, doc["email"]["body"][0])
            print(doc)

        page = es.scroll(scroll_id=sid, scroll=scroll_timeout)

        # Update the scroll ID
        sid = page['_scroll_id']
        # Get the number of results that we returned in the last scroll
        scroll_size = len(page['hits']['hits'])
finally:
    # Release the server-side scroll context instead of letting it linger
    # until the timeout; a context that already expired (404) is fine.
    es.clear_scroll(scroll_id=sid, ignore=(404,))
    neo.close()

# [document-viewer residue, not part of the program] You might also like