Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Creating custom (faster) implementations of the statistics query #738

Merged
merged 2 commits into from
Oct 2, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions aiida/backends/djsite/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,52 @@ def query_jobcalculations_by_computer_user_state(
return queryresults[:limit]
else:
return queryresults

def get_creation_statistics(
        self,
        user_email=None
):
    """
    Return a dictionary with the statistics of node creation, summarized by day,
    optimized for the Django backend.

    The counting and grouping is delegated to the database (three aggregate
    queries) instead of loading one row per node in Python.

    :note: Days when no nodes were created are not present in the returned
        `ctime_by_day` dictionary.

    :param user_email: If None (default), return statistics for all users.
        If an email is specified, return only the statistics for the given user.

    :return: a dictionary as follows::

            {
                "total": TOTAL_NUM_OF_NODES,
                "types": {TYPESTRING1: count, TYPESTRING2: count, ...},
                "ctime_by_day": {'YYYY-MM-DD': count, ...}
            }

        where in `ctime_by_day` the key is a string in the format 'YYYY-MM-DD'
        and the value is an integer with the number of nodes created that day.
    """
    import sqlalchemy as sa
    from aiida.backends.djsite.querybuilder_django import dummy_model

    # Also for the Django backend the query goes through SQLAlchemy:
    # the session is the one provided by aldjemy.
    session = dummy_model.get_aldjemy_session()
    node = dummy_model.DbNode

    def restrict_to_user(query):
        """Limit `query` to the nodes of `user_email`; no-op when no email is given."""
        if user_email is None:
            return query
        # NOTE(review): assumes dummy_model exposes DbUser (with `id` and `email`)
        # and that DbNode.user_id is the foreign key pointing to it -- confirm.
        user = dummy_model.DbUser
        return query.join(user, node.user_id == user.id).filter(user.email == user_email)

    # Total number of nodes
    total = restrict_to_user(session.query(node)).count()

    # Nodes per type: the query returns (typestring, count) pairs
    types_query = restrict_to_user(
        session.query(node.type.label('typestring'), sa.func.count(node.id)))
    types = dict(types_query.group_by('typestring').all())

    # Nodes created per day: the query returns (day, count) pairs, where `day`
    # is the creation time truncated to the day by the database.
    days_query = restrict_to_user(
        session.query(sa.func.date_trunc('day', node.ctime).label('cday'),
                      sa.func.count(node.id)))
    per_day = days_query.group_by('cday').order_by('cday').all()

    # Only days with at least one node are returned by the GROUP BY, so days
    # without nodes are absent from the dictionary (they are not zero-filled).
    ctime_by_day = {day.strftime('%Y-%m-%d'): count for day, count in per_day}

    return {
        "total": total,
        "types": types,
        "ctime_by_day": ctime_by_day,
    }
82 changes: 82 additions & 0 deletions aiida/backends/general/abstractqueries.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class AbstractQueryManager(object):

def __init__(self, *args, **kwargs):
pass


# This is an example of a query that could be overridden by a better implementation,
# for performance reasons:
def query_jobcalculations_by_computer_user_state(
Expand Down Expand Up @@ -106,3 +108,83 @@ def query_jobcalculations_by_computer_user_state(
returnresult = zip(*returnresult)[0]
return returnresult


def get_creation_statistics(
        self,
        user_email=None
):
    """
    Return a dictionary with the statistics of node creation, summarized by day.

    This is the generic, backend-independent implementation: it fetches one row
    per node through the QueryBuilder and aggregates the rows in Python.

    :note: In the returned `ctime_by_day` dictionary every day between the first
        and the last day in which a node was created is present; days in that
        range when no nodes were created have a count of 0. If there are no
        nodes at all, `ctime_by_day` is an empty dictionary.

    :param user_email: If None (default), return statistics for all users.
        If an email is specified, return only the statistics for the given user.

    :return: a dictionary as follows::

            {
                "total": TOTAL_NUM_OF_NODES,
                "types": {TYPESTRING1: count, TYPESTRING2: count, ...},
                "ctime_by_day": {'YYYY-MM-DD': count, ...}
            }

        where in `ctime_by_day` the key is a string in the format 'YYYY-MM-DD'
        and the value is an integer with the number of nodes created that day.
    """
    from aiida.orm.querybuilder import QueryBuilder as QB
    from aiida.orm import User, Node
    from collections import Counter
    import datetime

    date_format = '%Y-%m-%d'

    q = QB()
    q.append(Node, project=['id', 'ctime', 'type'], tag='node')
    if user_email is not None:
        q.append(User, creator_of='node', project='email', filters={'email': user_email})
    rows = q.all()

    # Each row starts with the projected node fields: [id, ctime, type]
    types = Counter(row[2] for row in rows)
    nodes_per_day = Counter(row[1].strftime(date_format) for row in rows)

    ctime_by_day = {}
    if nodes_per_day:
        # Because of the 'YYYY-MM-DD' format, the alphabetical order of the
        # keys is also their chronological order.
        first_day = datetime.datetime.strptime(min(nodes_per_day), date_format)
        last_day = datetime.datetime.strptime(max(nodes_per_day), date_format)
        one_day = datetime.timedelta(days=1)

        # Walk the whole range so that days without nodes are reported with 0
        current_day = first_day
        while current_day <= last_day:
            day_string = current_day.strftime(date_format)
            ctime_by_day[day_string] = nodes_per_day.get(day_string, 0)
            current_day += one_day

    return {
        "total": len(rows),
        "types": dict(types),
        "ctime_by_day": ctime_by_day,
    }

55 changes: 54 additions & 1 deletion aiida/backends/sqlalchemy/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,58 @@


class QueryManagerSQLA(AbstractQueryManager):
    """
    SQLAlchemy implementation of custom queries, for efficiency reasons
    """

    def get_creation_statistics(
            self,
            user_email=None
    ):
        """
        Return a dictionary with the statistics of node creation, summarized by day,
        optimized for the SQLAlchemy backend.

        The counting and grouping is delegated to the database (three aggregate
        queries) instead of loading one row per node in Python.

        :note: Days when no nodes were created are not present in the returned
            `ctime_by_day` dictionary.

        :param user_email: If None (default), return statistics for all users.
            If an email is specified, return only the statistics for the given user.

        :return: a dictionary as follows::

                {
                    "total": TOTAL_NUM_OF_NODES,
                    "types": {TYPESTRING1: count, TYPESTRING2: count, ...},
                    "ctime_by_day": {'YYYY-MM-DD': count, ...}
                }

            where in `ctime_by_day` the key is a string in the format 'YYYY-MM-DD'
            and the value is an integer with the number of nodes created that day.
        """
        import sqlalchemy as sa
        import aiida.backends.sqlalchemy
        from aiida.backends.sqlalchemy import models as m

        # Get the scoped session of the SQLAlchemy backend
        session = aiida.backends.sqlalchemy.get_scoped_session()
        node = m.node.DbNode

        def restrict_to_user(query):
            """Limit `query` to the nodes of `user_email`; no-op when no email is given."""
            if user_email is None:
                return query
            # NOTE(review): assumes the user model lives in models.user as DbUser
            # (with `id` and `email`) and that DbNode.user_id is the foreign key
            # pointing to it -- confirm.
            from aiida.backends.sqlalchemy.models.user import DbUser
            return query.join(DbUser, node.user_id == DbUser.id).filter(DbUser.email == user_email)

        # Total number of nodes
        total = restrict_to_user(session.query(node)).count()

        # Nodes per type: the query returns (typestring, count) pairs
        types_query = restrict_to_user(
            session.query(node.type.label('typestring'), sa.func.count(node.id)))
        types = dict(types_query.group_by('typestring').all())

        # Nodes created per day: the query returns (day, count) pairs, where `day`
        # is the creation time truncated to the day by the database.
        days_query = restrict_to_user(
            session.query(sa.func.date_trunc('day', node.ctime).label('cday'),
                          sa.func.count(node.id)))
        per_day = days_query.group_by('cday').order_by('cday').all()

        # Only days with at least one node are returned by the GROUP BY, so days
        # without nodes are absent from the dictionary (they are not zero-filled).
        ctime_by_day = {day.strftime('%Y-%m-%d'): count for day, count in per_day}

        return {
            "total": total,
            "types": types,
            "ctime_by_day": ctime_by_day,
        }


94 changes: 94 additions & 0 deletions aiida/backends/tests/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,5 +861,99 @@ def test_create_node_and_query(self):
self.assertEqual(idx,99)
self.assertTrue(len(QueryBuilder().append(Node,project=['id','label']).all(batch_size=10)) > 99)

class TestStatisticsQuery(AiidaTestCase):
    """
    Tests of `get_creation_statistics`, both for the query manager of the
    current backend and for the generic implementation of `AbstractQueryManager`.
    """

    def _check_statistics(self, qmanager):
        """
        Store a few nodes and check that the statistics returned by `qmanager`
        change accordingly.

        The expected values are computed starting from the statistics returned
        before storing the nodes, so the check does not depend on the past
        state of the database.

        :param qmanager: the query manager instance whose
            `get_creation_statistics` method is tested.
        """
        from aiida.orm import Node, DataFactory, Calculation
        from collections import defaultdict

        current_db_statistics = qmanager.get_creation_statistics()

        # defaultdicts, so that types/days not yet in the DB start from zero
        expected_db_statistics = {
            'total': current_db_statistics['total'],
            'types': defaultdict(int, current_db_statistics['types']),
            'ctime_by_day': defaultdict(int, current_db_statistics['ctime_by_day']),
        }

        ParameterData = DataFactory('parameter')

        for node_class in (Node, ParameterData, ParameterData, Calculation):
            node = node_class()
            node.store()
            expected_db_statistics['total'] += 1
            expected_db_statistics['types'][node._plugin_type_string] += 1
            expected_db_statistics['ctime_by_day'][node.ctime.strftime('%Y-%m-%d')] += 1

        new_db_statistics = qmanager.get_creation_statistics()
        # I only check a few fields
        new_db_statistics = {
            key: value for key, value in new_db_statistics.items()
            if key in expected_db_statistics
        }

        # Convert back to plain dicts before comparing
        expected_db_statistics = {
            key: dict(value) if isinstance(value, defaultdict) else value
            for key, value in expected_db_statistics.items()
        }

        self.assertEqual(new_db_statistics, expected_db_statistics)

    def test_statistics(self):
        """
        Test if the statistics query works properly, using the query manager
        of the current backend.

        I try to implement it in a way that does not depend on the past state.
        """
        from aiida.backends.utils import QueryFactory

        self._check_statistics(QueryFactory()())

    def test_statistics_default_class(self):
        """
        Test if the statistics query works properly, using the default
        implementation provided by `AbstractQueryManager`.

        I try to implement it in a way that does not depend on the past state.
        """
        from aiida.backends.general.abstractqueries import AbstractQueryManager

        # Subclass that overrides nothing, so the generic implementation is used
        class QueryManagerDefault(AbstractQueryManager):
            pass

        self._check_statistics(QueryManagerDefault())

8 changes: 0 additions & 8 deletions aiida/restapi/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def __init__(self, app=None, **kwargs):
self.add_resource(Calculation,
'/calculations/',
'/calculations/schema/',
'/calculations/statistics/',
'/calculations/page/',
'/calculations/page/<int:page>/',
'/calculations/<id>/',
Expand All @@ -144,7 +143,6 @@ def __init__(self, app=None, **kwargs):
self.add_resource(Data,
'/data/',
'/data/schema/',
'/data/statistics/',
'/data/page/',
'/data/page/<int:page>',
'/data/<id>/',
Expand All @@ -164,7 +162,6 @@ def __init__(self, app=None, **kwargs):
self.add_resource(Code,
'/codes/',
'/codes/schema/',
'/codes/statistics/',
'/codes/page/',
'/codes/page/<int:page>/',
'/codes/<id>/',
Expand All @@ -184,7 +181,6 @@ def __init__(self, app=None, **kwargs):
self.add_resource(StructureData,
'/structures/',
'/structures/schema/',
'/structures/statistics/',
'/structures/page/',
'/structures/page/<int:page>',
'/structures/<id>/',
Expand All @@ -205,7 +201,6 @@ def __init__(self, app=None, **kwargs):
self.add_resource(KpointsData,
'/kpoints/',
'/kpoints/schema/',
'/kpoints/statistics/',
'/kpoints/page/',
'/kpoints/page/<int:page>',
'/kpoints/<id>/',
Expand All @@ -226,7 +221,6 @@ def __init__(self, app=None, **kwargs):
self.add_resource(BandsData,
'/bands/',
'/bands/schema/',
'/bands/statistics/',
'/bands/page/',
'/bands/page/<int:page>',
'/bands/<id>/',
Expand All @@ -247,7 +241,6 @@ def __init__(self, app=None, **kwargs):
self.add_resource(User,
'/users/',
'/users/schema/',
'/users/statistics/',
'/users/page/',
'/users/page/<int:page>/',
'/users/<id>/',
Expand All @@ -257,7 +250,6 @@ def __init__(self, app=None, **kwargs):
self.add_resource(Group,
'/groups/',
'/groups/schema/',
'/groups/statistics/',
'/groups/page/',
'/groups/page/<int:page>/',
'/groups/<id>/',
Expand Down
5 changes: 2 additions & 3 deletions aiida/restapi/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,7 @@ def get(self, id=None, page=None):
## Build response and return it
headers = self.utils.build_headers(url=request.url, total_count=1)

## Treat the statistics (TODO: recoded when group_by will be
# available, it should pass by the tranlsator)
## Treat the statistics
elif query_type == "statistics":
(limit, offset, perpage, orderby, filters, alist, nalist, elist,
nelist) = self.utils.parse_query_string(query_string)
Expand All @@ -161,7 +160,7 @@ def get(self, id=None, page=None):
usr = filters["user"]["=="]
else:
usr = []
results = self.trans.get_statistics(self.tclass, usr)
results = self.trans.get_statistics(usr)

# TODO Might need to be improved
elif query_type == "tree":
Expand Down
Loading