diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/FlavorADX_Surya.md b/FlavorADX_Surya.md new file mode 100644 index 0000000..5838635 --- /dev/null +++ b/FlavorADX_Surya.md @@ -0,0 +1,139 @@ +# This is a mere flavor of ADX + + + Assuming our data directory is conveniently mounted on root like `/data`. + +## Crawler + + Crawler can either run from your favorite shell as a standalone program or run in the background as a daemon, patiently waiting and checking for new files for indexing and analyzing. + +To run the daemon on our data directory: + + adxd /data --parser=Parse_Filterbank.py + + adxd /data --frequency * * * 59 + + adxd /data --db=localhost 34001 + + The `parser` argument is nothing but a class definition, which will be defined below. + + The `frequency` argument works just like the schedule fields of a crontab entry and decides the frequency at which the daemon should check for new files. + + The `db` argument provides the IP and port number to host a database. It will be ignored if the user opts for astropy tables instead. + + To run the crawler from your shell: + `$ adx /data --parser=Parse_Filterbank.py` + + As mentioned above, `parser` will be discussed here. + + +## Parser + There will be a main parser class which the user doesn't touch. All the info will be added to the parser class by ADX `ParserType` instances. For instance, consider this example: + + ParserType fil +Creation of `ParserType` class + + fil.AddExtensionRule('fil') +Method of `ParserType` which adds a rule for the file extension. + + fil.AddFilenameRegexRule('[JB]+[0-9]{4}[+-][0-9]{2,4}_[0-9]{5}_') +Method to add a regex rule to test for files for further processing. This regex matches the source JName or BName followed by the MJD. + + fil.AddFilenameRule('!kur') +Method which adds a filename rule. +A rule starting with `!` implies logical NOT. 
+ + fil.AddFilenameRule('kur') +Method which adds a filename rule; this one means that only kurtosis files will be accepted. + + fil.AddSignature('^BBX') +Method which checks for a signature in the file. N.B. there is no way to do this without opening the file. +`^` means the beginning of the file. `$` means the end of the file. This is similar to regex matching. + + fil.Reader(MyFilReader) +Binds a class interface for Filterbank files to the parser. If this option is provided and there are metrics in *this* `ParserType` instance which take the class interface as argument, a single instance of the reader class (interface class) is created and passed as an argument to one (or more) such metrics. *This will be further elucidated in an example.* + + fil.AddFloat('mean') +Adds a data field (which is to be tabulated) with column name 'mean' and, since it is a fairly common statistic, `Parser` uses its own definition of mean computation. The user doesn't have to provide an implementation of the mean computation. + + fil.AddFloat('smean', GiveSMean) +Similar to the above, but the column name is 'smean', which stands for special mean, and `GiveSMean` is a callable class or function + which is provided by the user. If the user has provided an interface class using the `Reader` method, `GiveSMean` should take that class as an argument. + + fil.AddString('polarisation', GivePol) +This method adds a string data field with the name 'polarisation' and, similar to the above, `GivePol` is a callable class or function which can take either a filename or a `Reader` class as its argument. Making the function signature match is the user's responsibility. + + fil.AddDateTime('ctime') +There will be some fields which are captured by ADX by default. Among them are the `ctime, atime, mtime` of files, which prove helpful in keeping track of files. + .... + +There will/can be many more options, which for brevity's sake are left out. 
Now, here comes the first act of magic: + + Parser.AddParserType(fil) +This method takes the necessary stuff (all the required things it would need) and merges it into the `Parser` class. + The `ParserType` class can be heavy, even over-engineered, but ONLY the parts which are actually relevant (as decided by the `Add*` methods called by the user) are injected into the `Parser` class. + You can add multiple `ParserType`s to the same `Parser` class, and the `Parser` class knows what to do with each of them. + +This approach not only gives the user a set of tools but also constrains the structure of the code and thereby increases regularity, which helps us developers write smooth code. Instead of telling the user "look, define this method `def __call__(self)` to your liking and then make it return a dictionary of key-value pairs", we are telling the user "you know what you want? OK, add the methods which we provided in `ParserType` to get whatever you want. We made sure that anything you want, our methods in the `ParserType` class can get you. Once you're done, pass that class to the main `Parser` and chill." + +This approach also ensures that every file is only opened once, since every metric is computed from the same `File Handle`. + +### ParserType example + +Let us crawl through a directory containing pulsar integrated profiles which have a .prof extension and whose first line (and only the first line) is a header with + + `# MJD, Fraction-of-day, Number of periods in integration, period, DM, num-bins, polarisation, observatory-code` + + followed by num-bins numbers which correspond to the actual data. + This is how it would look. 
+ + ParserType prof + prof.AddExtensionRule('prof') + prof.Reader(MyProfReader) + prof.AddFloat('SN',GiveSN) + prof.AddFloat('DM',GiveDM) + prof.AddFloat('MJD',GiveMJD) + prof.AddString('PSR', 10, GiveJName) + Parser.AddParserType(prof) + +The functions used above are defined in the same place where `ParserType` is defined and they all have the following signature: + + class MyProfReader(object): + def __init__(self, filepath): + # initialization + self.dm = ... + self.data = ... + + def GiveDM(x): + return x.dm + + def GiveSN(x): + # SN computation using x.data + +## Crawler + +Coming back to the crawler: our `Parser` class, loaded with all the rules while still hidden from the user, can safely interact with the `Crawler` class in a pre-determined fashion, just the way we developers want. + + Parser.GetExtensions() +This method would return the extensions which were added to the `Parser` class. + +Second act of magic: +The `Parser` class, into which all the `ParserType`s are injected, creates a regex rule for each of the `ParserType` classes. This regex rule is generated from the filename, pathname and extension, and is pretty robust. It is this rule which is passed on to `Crawler` at the start of crawling and used to figure out what to do with each file. On a successful match, `Parser` internally calls and computes all the metrics asked for by the user (the user is not calling a function; s/he is merely specifying what s/he wants). + +## Logger +Third and final act of magic: + +The `Parser` class again comes to the rescue here and tells us the schema of each of the tables. `Crawler` and `Logger` can internally talk among themselves. + +## Final comments + +We can really brainstorm and add contrived `ParserType` methods such as `AddOnFile`, which computes a statistic (not recorded by Logger) and performs logic based on that statistic. 
+ + ParserType prof + prof.AddOnFile(AlertIfNoDetection) + +In the running example, `AddOnFile` binds function or callable class `AlertIfNoDetection` which takes `MyProfReader` as argument and runs some statistical test to check for detection and if it finds that there's no detection, it shouts. + +Not just it, this approach hides `Crawler` and `Logger` class from the user with the exception of `Parser` in which only one method is exposed, the `AddParserType` method. The definition of `Parser` will happen in the main body and `adx --parser=MyParser.py` uses it without defining it. + +The true power (according to me) comes from creating the tools which the user can just call and use to his/her liking and rest is taken care by ADX. \ No newline at end of file diff --git a/TODO b/TODO new file mode 100644 index 0000000..0336183 --- /dev/null +++ b/TODO @@ -0,0 +1,4 @@ +- + + +- PEP8 diff --git a/adx/Adx.py b/adx/Adx.py new file mode 100644 index 0000000..d3683cb --- /dev/null +++ b/adx/Adx.py @@ -0,0 +1,64 @@ +''' +ADX main class definition +''' +# ADX stuff +from parser import Parser +from crawler import Crawler +# other stuff +import multiprocessing as mp + +__all__ = ['Adx'] + +class Adx(object): + def __init__(self, cdir, pars, logg, + daemon=True, + verbose=0, + numthreads=1, + debug=False): + ''' + Arguments + --------- + + cdir : str, or list of str + Directory or list of directories to crawl + pars : instance of Parser or list of ParserTypes + logg : any instance of logging + + ''' + # crawl setup + self.crawler = Crawler(cdir) + # parse setup + if isinstance(Parser, pars): + self.parsers = pars + elif isinstance(list, pars): + self.parsers = Parser() + for pt in pars: + self.parsers.AddParserType(pt) + # logger setup + self.logger = logg + # misc options + self.daemon = daemon + self.debug = debug + self.verbose = 3 if debug else verbose + self.numthreads = numthreads if numthreads < mp.cpu_count() else mp.cpu_count() + # max number of threads is number of 
CPUs + + def __step(self,currdir, curr): + # moving mountains + # one rock at a time + # to ensure grouped and make use of InsertMany + rdict = self.parsers.parseAction(curr) + for k,v in rdict.items(): + if len(v) == 0: + continue + self.logger.InsertMany(k,v) + + def __setup(self): + # this isn't necessary anymore + # self.logger.getSchema( self.parser.putSchema() ) + pass + + def walk(self): + for pdir, curr in self.crawler: + self.__step(pdir, curr) + diff --git a/adx/parser/__init__.py b/adx/__init__.py similarity index 100% rename from adx/parser/__init__.py rename to adx/__init__.py diff --git a/adx/adx b/adx/adx new file mode 100755 index 0000000..4e68e21 --- /dev/null +++ b/adx/adx @@ -0,0 +1,127 @@ +#!/usr/bin/env python2.7 + +_VERSION_ = "0.0.1" +_PERSISTANCE_FILE_NAME_ = "/tmp/adx_persistant_logger" +_PERSISTANCE_KEY_ = 'ala' + +import argparse +import subprocess as sp +import sys +import logging + +# log setup +logging.basicConfig(format='[%(levelname)s] %(message)s') +mylog = logging.getLogger() + + + +def parseargs(): + adxargparser = argparse.ArgumentParser(prog="ADX", description="Command line interface to ADX", epilog="ADX v"+_VERSION_) + subs = adxargparser.add_subparsers(help="Commands", dest='cmd') + ## logout + lgroup = subs.add_parser("logout", help="Closes everything and logs out.") + ## schema + sgroup = subs.add_parser("schema", help="Prints schema.") + ## refresh + rgroup = subs.add_parser("refresh", help="Refreshs database.") + ## update + ugroup = subs.add_parser("update", help="Updates database.") + ## crawl + crawlgroup = subs.add_parser("crawl", help="Crawl action.") + addarg = crawlgroup.add_argument + addarg("-d,dir", help="Directories", action='store', nargs='*', dest='DIRS') + addarg("-p,parse", help="ParserType files", action='store', nargs='*', dest='PTS') + ## connect + cgroup = subs.add_parser("connect", help="Connect help") + addarg = cgroup.add_argument + addarg("-n,name", help="Name of the ADX/Project", default='adx', 
dest='NAME') + addarg("--one-session", action='store_true', default=True, dest='persist', help="Flag to make persist connection.") + ## db options + cgroup_subs = cgroup.add_subparsers(help="Interfaces", dest='interface') + dbcgroup = cgroup_subs.add_parser("mongodb", help="MongoDB interface") + addarg = dbcgroup.add_argument + addarg("--connect", help='IP to connect to database.', default='127.0.0.1:27017', metavar='X.X.X.X:P') + ## tab options + tabcgroup = cgroup_subs.add_parser("table", help="Astropy Table interface") + addarg = tabcgroup.add_argument + addarg("--tabpath", help="Table path.", default='.') + ## query + qgroup = subs.add_parser('query',help="Query help") + addarg = qgroup.add_argument + addarg("--par", help="Parameter to query") + addarg("--cond", help="Condition") + addarg("--exec", help="Execute afer finding") + addarg("--out", help="Output filepaths") + addarg("--explain", help="Explain query") + addarg("--absolute", action='store_true', help="Return absolute paths") + addarg("parsertype", help="The parserType", nargs=1) + addarg("query", help="JSON-like query\nYou will need to quote it.") + #### + return adxargparser.parse_args() + +def main(): + ### this function returns + logger = None + persister = None + ### + opts = parseargs() + if opts.cmd == 'schema': + print "requested schema" + elif opts.cmd == 'connect': + print "requested connect" + if opts.interface == 'mongodb': + print "requested mongodb" + print "Connect at ", opts.connect + print "DBname is ", opts.NAME + import mongodbio + logger = mongodbio.dbio(name = opts.connect, dbname = opts.NAME) + print type(logger) + elif opts.interface == 'table': + print 'requested tabio' + print "tabpath", opts.tabpath + import tabio + logger = tabio.tabio() + if opts.persist: + print "Asked for persistance" + # same filename + from adxshelver import Shelver + persister = Shelver(_PERSISTANCE_FILE_NAME_) + persister.save(_PERSISTANCE_KEY_, logger) + # XXX ala is for now a hack. 
+ # you need a persistent name too + elif opts.cmd == 'crawl': + # from imp import load_source + # XXX this is deprecated!!! + import importlib + print "requested crawl" + print "DIRS:", opts.DIRS + print "PT files", opts.PTS + pts = [importlib.import_module(p) for p in opts.PTS] + print pts + elif opts.cmd == 'refresh': + print "requested refresh" + elif opts.cmd == 'update': + print "requested update" + elif opts.cmd == 'query': + pt = opts.parsertype[0] + print "queried parsertype", pt + import ast + query = ast.literal_eval(opts.query) + print "requested query", opts.query + from adxshelver import Shelver + persister = Shelver(_PERSISTANCE_FILE_NAME_) + logger = persister.get(_PERSISTANCE_KEY_) + logger.Query(pt, query) + elif opts.cmd == 'logout': + print "requested logout" + from adxshelver import Shelver + persister = Shelver(_PERSISTANCE_FILE_NAME_) + persister.close() + ### graceful termination + logger and logger.close() + persister and persister.close() + print "Exiting main" + return opts + +if __name__ == '__main__': + opts = main() diff --git a/adx/adx_persistance_logger b/adx/adx_persistance_logger new file mode 100644 index 0000000..ef8707b Binary files /dev/null and b/adx/adx_persistance_logger differ diff --git a/adx/adxceptions.py b/adx/adxceptions.py new file mode 100644 index 0000000..1fb086c --- /dev/null +++ b/adx/adxceptions.py @@ -0,0 +1,9 @@ +''' +Class definitions of all the exceptions passed around +by ADX +''' +class ADXception(Exception): + pass + +class ADXLogImportError(ADXception): + pass diff --git a/adx/adxshelver.py b/adx/adxshelver.py new file mode 100644 index 0000000..fca3faa --- /dev/null +++ b/adx/adxshelver.py @@ -0,0 +1,31 @@ +''' +Shelving class +''' +import shelve as sh + +class Shelver(): + ''' + Manages the shelving actions + ''' + def __init__(self, filename): + self.shelf = sh.open(filename) + self.filename = filename + + def close(self): + self.shelf.close() + # delete file + + def list(self): + return 
self.shelf.keys() + + def save(self, k, v): + self.shelf[k] = v + + def get(self, k): + return self.shelf[k] + + def savefilename(self,path, session): + f = open(path + session,'w') + f.write(self.filename) + f.close() + diff --git a/adx/crawler.py b/adx/crawler.py index f7c754a..73b638e 100644 --- a/adx/crawler.py +++ b/adx/crawler.py @@ -1,107 +1,31 @@ -"""crawler.py defines the ADX crawler and supporting functions. - The functionality of ADX crawler. - 1. Navigate through one directory - 2. Check item type - 3. Log data file information. -""" - +''' +Crawler class definition +''' import os - class Crawler: - """ Crawler is designed to traverse all the items including the - subdirectories in a directory. After the crawling, it returns a list of - target files and their parser. - - Parameter - --------- - dir_name : str - Directory name - parsers : list - A list of `Pareser object` for each file type. - recusive: bool, optional - The flag that tells if the crawler to go through the subdirecotries. - The default is 'True'. - - Note - ---- - Crawler only parses the file types that have the `Parser` object - provided. - """ - def __init__(self, dir_name, parsers, recusive=True): - self.root_dir = dir_name - self.parsers = parses - # If the parser_template does not include the customer defined parser, - # it will add the default directory parser. - if 'directory' not in self.parser_template: - self.parser_template.update({'directory': ParserDir}) - self.recusive = recusive - self.cur_location = dir_name - self.cur_dir_info = ParserDir(self.cur_dir) - self.visited = [self.cur_location,] - self.set_up() - - def set_up(self): - """ This function prepares the crawler. It does the following steps: - 1. Gather the file types information for the parsers. - 2. Build the extension map - .... - """ - self.target_types = [] - self.ext_map = {} - # get all the types and build the extension map. 
- for p in self.parsers: - self.target_types.append(p.file_type) - for ext in p.extensions: - if ext not in self.ext_map.keys(): - self.ext_map[ext] = [p,] - else: - self.ext_map.append(p) - - def _check_ext(self, item): - """ Check item's extension. - - Parameter - --------- - item : str - The full path to the item. - - Return - ------ - The item extension as a string. If the item is a directory, it - returns 'directory'. - """ - if os.path.isdir(item): - return 'directory' - else: - filename, file_type = os.path.splitext(item) - return file_type - - def get_parser(self, item): - """Get the right parser for the item according to the item type. The - item type will be indentified by file extension checking and parser's - double checking. - - Parameter - --------- - item : str - The full path to the item. - Return - ------ - The Parser class - """ - item_ext = self._check_ext(item) - if item_ext not in self.ext_map: - return - else: - for tp in self.ext_map(item_ext): - if tp.check_type(item): - return tp - else: - return - - # TODO add functions to read the old file lists and not crawl the loged - # files + ''' + Crawler class + + + N.B. Delegate all multi threading to ADX class + as multi-threading in CPython negatively affects performance due to + Global Interpreter Lock. + Make use of multiprocessing. + ''' + def __init__(self, + cdir, + followLinks = False, + ): + self.workd = [] + if isinstance(cdir, str): + cdir = [ cdir ] + self.workd = self.workd + cdir + self.flinks = followLinks + + def __iter__(self): + # yield a tuple dawg + for cd in self.workd: + for dirpath, dirs, files in os.walk(cd, followlinks = self.flinks): + yield dirpath, files - def crawl_dir(self, recusive=True): - pass diff --git a/adx/logger.py b/adx/logger.py index 7708391..e8f4cf1 100644 --- a/adx/logger.py +++ b/adx/logger.py @@ -1,54 +1,78 @@ -"""Logger.py defines the class for recording the parsed information and create -the indexing table. 
-""" -from astropy import log - - -class Logger: - """Logger class is designed to record the file or directory information to - a data table. A logger operates on a list of files (with full path) and the - associated file parser. The output file can be customerized by the user - - Parameters - ---------- - target_items : dict - The items need to be parsed. The key is the full path to the item, and - the value is the parser for the item type. - """ - def __init__(self, target_items): - self.target_items = target_items - self.set_up_table() - - @property - def items(self): - return list(self.target_items.keys()) - - @property - def parsers(self): - return list(set(self.target_items.keys())) - - @property - def types(self): - parsers = list(set(self.target_items.keys())) - t = [p.file_type for p in parsers] - return t - - def set_up_table(self): - self.tables_cols = {} - for prs in self.parsers: - self.tables_cols[prs] = [pf[0] for pf in prs.parse_funcs] - - def get_info_entry(self, item, parser): - """ Parse the item information following the table column. 
- """ - info = parser(item) - entry = () - for tc in self.tables_cols[parser.file_type]: - entry += (info[tc],) - return entry - - def log_info(self): - pass - - def write_info(self): - pass +''' +Logger class definition +''' + +from adx.io import dbio +from adx.io import tabio + + +class Logger(dbio, tabio): + ''' + Logger class + ''' + def __init__(self, name=None, numtables=1, writeDB=None): + if self.validIP(name): + # need to write to a db + self.sqldb = True + self.log = dbio(ip, port, writeDB) + if writeDB is None: + self.writedb = name + else: + self.writedb = writeDB + else: + # need to write tables + self.logdir = name + + def __addSchema(self, name, dtype): + if name in self.schema: + raise ADXLogError("Schema ill-defined.") + self.schema.append(name) + self.schema_dtype.append(dtype) + + def __prepareLog(self): + self.log() + + def getSchema(self, schema): + ''' + Private method which gets schema from Parser objects. + + Parameters + ---------- + + schema : list or str + schema <- 'sn:float' + schema <- 'backend:str(256)' + schema <- [ 'sn:float', 'backend:str(256)' ] + + Note + ---- + + You shouldn't probably call this on your own unless you know what you're doing. + ''' + if isinstance(schema, list): + # multiple schemas + for s in schema: + par, val = s.split(':') + self.__addSchema(par, val) + elif isinstance(schema,str): + # only one schema + par, val = schema.split(':') + self.__addSchema(par, val) + + def log(self, pt, ddir, data): + ''' + Single atomic action resolution for a filepath + + Parameters + ---------- + + data : dict ? + which is to be retured by the Parser + ''' + if pt not in self.schema: + raise ADXLogError("ParserType not recognized.") + if self.sqldb: + self.__writeDB(ddir, data) + # INSERT INTO ?(pt) VALUES (?,?,?...) 
+ else self.atables: + self.__writeTab(ddir, data) diff --git a/adx/mongodbd.py b/adx/mongodbd.py new file mode 100644 index 0000000..443bddd --- /dev/null +++ b/adx/mongodbd.py @@ -0,0 +1,17 @@ +import subprocess + +class dbd: + def __init__(self, + dbpath, + bind_ip = 'localhost', + port = 87654, + maxc = 5, + daemon = True, + logpath = None, + ): + # python daemon ???? + # Having a separate class for dbdaemon? + # check if mongod is already running + # mongorestore + # mongodump + if diff --git a/adx/mongodbio.py b/adx/mongodbio.py new file mode 100644 index 0000000..7a9a804 --- /dev/null +++ b/adx/mongodbio.py @@ -0,0 +1,65 @@ +'''Handles db io''' + +from adxceptions import ADXLogImportError + +try: + import pymongo as pmdb +except ImportError: + raise ADXLogImportError("Mongodb not found/installed.") + +class dbio(pmdb.MongoClient): + def __init__(self, + name = 'localhost:27017', + dbname = 'name' + ): + ip, port = name.split(':') + super(dbio, self).__init__(name) + self.name = name + self.dbname = dbname + self.ptypes = super(dbio, self).__getitem__(self.dbname).list_collection_names() + self.schema = {ix:[] for ix in self.ptypes} + + def __getitem__(self, key): + return pmdb.MongoClient.__getitem__(self, self.dbname).__getitem__(key) + + def __feedSchema__(self, schema): + # schema be a dict. + # each key is pt + # each value is iterable + for k,v in schema.iteritems(): + if k in self.ptypes: + # ptypes was initialized before? + # okay.. 
+ self.schema[k] = self.schema[k] + v + else: + self.ptypes.append(k) + self.schema[k] = v + assert set(self.schema.keys()) == set( self.ptypes ) + # we anyway shouldn't have any duplicates smh + + def __reduce__(self): + # to make pickled + # https://stackoverflow.com/questions/19855156/whats-the-exact-usage-of-reduce-in-pickler + return (self.__class__, (self.name, self.dbname)) + + + def Query(self, qpt, query, projection = "{_id: 0}"): + # need to decide on the grammar + # TODO it be fun to implement this + return self[qpt].find(query, projection) + + def InsertOne(self, qpt, payload): + return self[qpt].insert_one(payload) + + def InsertMany(self, qpt, payload): + return self[qpt].insert_many(payload) + + def DeleteOne(self, qpt, pred): + pass + + def Close(self): + # CLOSE DB + self.close() + + + diff --git a/adx/parser.py b/adx/parser.py new file mode 100644 index 0000000..86a9ee4 --- /dev/null +++ b/adx/parser.py @@ -0,0 +1,353 @@ +''' +Main Parser class implementation where all the ParserType stuff is injected into +''' + + +__all__ = ['Parser','ParserType'] + +import re + +class Parser: + ''' + ain't nobody got time to write the doc string + + Attributes + ---------- + + __rule : list + List of callables which return bool + __parsertype : list + List of identifiers of parsertype. + Identifiers need to be unique. + Elements of this list are used as keys everywhere. 
+ __schema : dict + Dictionary with keys as __parsertype and + values as schema requested + __actions: dict + Dictionary with keys as __parsertype and + values as callable which returns scalar or list + depending on the __schema + __reader: dict + If any class interface was given, it's loaded + + Assertions + ---------- + __rule <1-1> __parsertype + One to one mapping + __schema.keys() == __actions.keys() <== keys from __parsertype + ''' + + def __init__(self): + ''' + Sets up environment + ''' + self.__rule = list() + self.__parsertypes = list() + self.__actions = dict() # 1-1 on parsertypes + self.__schema = dict() + self.__reader = dict() + self.__dtypes = dict() + + def __SchemaToLogger(self): + ''' + Method to get schema as received by the Parser to our brother Logger + + Do we actually need it? + > In db case, we need it to indexing purposes + > In tab case, we NEED NEED it. + Note + ---- + + Tis be a private method. Thou shallnt call thine. + ''' + # YOLO -- delegating work to future Surya + return self.__schema + + def parseAction(self, filelist): + ''' + Method which does the work. + Our brother Crawler isn't helping us so it is upto Parser to decide what to do with the file in hand. + + Parameters + ---------- + + filelist : list, or iterable + List of files + + Returns + ------ + ret : dict + ''' + ret = {ipt:[] for ipt in self.__parsertypes} + # TODO irule matching + # TODO signature matching + # TODO statistics extraction + # TODO Return stats to Logger + for fl in filelist: + k,v = self.__parser(fl) + if k is None: + continue + ret[k].append(v) + return ret + + def __parser(self, filepath): + ''' + Method which does the work. + Our brother Crawler is helping us by telling us PtIndex. + PtIndex is index of the ParserType which matches the file at hand. 
+ + Parameters + ---------- + + filepath : str + The entire filepath + + Note + ---- + Our brother Crawler might make mistake (but, he's our brother afterall), so if we get ADXMisMatchError anywhere, we fall back to ParserType resolution. + Do we???? + ''' + for ipt,rulz in zip(self.__parsertypes, self.__rule): + if rulz(filepath): + if self.__reader[ipt] is not None: + # this ensures that file is read ONLY once + fre = self.__reader[ipt]( filepath ) + else: + fre = filepath + return ipt, {sch:act(fre) for sch,act in zip(self.__schema[ipt],self.__actions[ipt])} + # This above line is beautiful + return None, None + + def AddParserType(self, pt): + ''' + Only method exposed to the user for injection. + + Parameters + ---------- + pt : ParserType object + An instance of a ParserType object which contains all the rules. + + Note + ---- + This is like the wedding of ParserType into Parser. + ''' + ## yo mama gonna get hitched + # name + self.__parsertypes.append( pt.name ) + # rule + self.__rule.append( pt.rule ) + # schema + self.__schema[pt.name] = pt.schema + # action + self.__actions[pt.name] = pt.funcs + # dtype + self.__dtypes[pt.name] = pt.dtypes + # reader + self.__reader[pt.name] = pt.reader + +class ParserType: + ''' + yo mama class + ''' + def __init__(self, name): + ''' + + Parameters + ---------- + + name : str + Should be fooking unique, dawwwg + + Note + --- + This class will be heavy + ''' + self.name = name # name identifier, should be fooking unique + # extension + self.__extension = '' + # filename + self.__filename = '' + # Signature stuff + self.__signature = None + self.__signature__begin = False + self.__signature__end = False + # wedding stuff + self.rule = None # callable biatch + self.schema = [] # holds the schemas + self.funcs = [] # holds the semantic action + self.dtypes = [] # holds the return types + self.reader = None # do you even optimize bro? + + def AddExtensionRule(self, extension): + ''' + Method to add an extension rule. 
+ + Parameter + -------- + extension : str + Extension of the files to be trapped. + ''' + self.__extension = extension + self.__CreateRegex() + + def AddFilenameRule(self, filename): + ''' + Method to add filename rule. + + Parameters + ---------- + + filename : str + Part of the filename to check for while trapping + ''' + self.__filename = filename + self.__CreateRegex() + + def AddFilenameRegexRule(self, regexp): + ''' + Method to add regex rule for filename based trapping + + Parameters + ---------- + regexp : str + Regular expression which should be compiled successfully using Python re module + + Note + ---- + `ParserType` will not generate a regex rule if the user provides this. + ''' + try: + rex = re.compile(regexp) + except re.error: + raise ValueError('cannot prepare regex') + self.rule = lambda x : rex.match(x) is not None + + def __CreateRegex(self): + ''' + Private method to create ultimate regex rule which will be passed to Parser. + + Will be called after AddExtensionRule, AddFilenameRule, AddFilenameRegexRule + Note + ---- + This method is not exposed to user. It will be internally called. + ''' + rrule = '$.*' + self.__filename + '.*.' + self.__extension + '$' + try: + rex = re.compile(rrule) + except re.error: + raise ValueError('Cannot make regex rule') + self.rule = lambda x : rex.match(x) is not None + + def AddSignature(self, signature): + ''' + Method to add a signature check. + + Parameters + ---------- + + signature : str + Signature which will be used to check file + + Note + ---- + There are some special characters that can be used in defining signature: + - If the signature begins with '^' (caret) followed by text, the first few lines of the text is read and checked against the text to verify the signature. + - If the signature ends with '$' (dollar sign), the file under question is checked if it ends with the text preceeding '$'. 
+ + For example, + - '^BBX' means file is passed ONLY if the first three characters read from the top of the file match 'BBX' + - 'XBB$' means file is passed ONLY if the last three characters read from the end of the file match 'XBB' + ''' + if signature[0] == '^': + self.__signature__begin = True + self.__signature__end = False + elif signature[0] == '$': + self.__signature__begin = False + self.__signature__end = True + self.__signature = signature + + def Reader(self, readme): + ''' + Method to bind an interface class to interact with the files. + + Parameters + ---------- + + readme : Function or Callable class + + Note + ---- + + Define your interface class in the same place where you're defining your ParserType classes + ''' + self.reader = readme + + def AddPar(self, description, func, dtype='float', length=None): + ''' + Why have two functions when one function can do the job? + ''' + self.dtypes.append( dtype if length else dtype+str(length) ) + self.schema.append( description ) + self.funcs.append( func ) + + def AddFloat(self, description, func): + ''' + Method to add a statistics to be tracked by ADX. + + Parameters + ---------- + + description : str + A short string which will be used as a column name in database or AstroPy Tables + + func : Function or callable class + IMP func should either take a `reader` class or `filename` as argument + + Note + ---- + + Make your life easier by adding a reader class to your `ParserType` instantiation and defining your `func` to take a reader class object as argument. + ''' + self.dtypes.append( 'float' ) + self.schema.append( description ) + self.funcs.append( func ) + + def AddString(self, description, length, func): + ''' + Method to add a string based statistics to be tracked by ADX + + Parameters + ---------- + + description : str + A short string which will be used as a column name in database or AstroPy Tables + + length : int + Maximum length of the string. 
+ + func : Function or callable class + IMP func should either take a `reader` class or `filename` as argument + ''' + self.dtypes.append( 'string'+str(length) ) + self.schema.append( description ) + self.funcs.append( func ) + + def AddOnFile(self, func): + ''' + Method to bind a function call action on every file which is parsed. + + Parameters + ---------- + func : Function or callable class + IMP func should either take a `reader` class or `filename` as argument + + Note + ---- + This seems so useful rn. F this shit. + Now, I don't know it felt so important then. + ''' + self.__onfilefunc = func + + def AddDirectoryRule(self): + # TODO + pass diff --git a/adx/parser/parser.py b/adx/parser/parser.py deleted file mode 100644 index 15a790f..0000000 --- a/adx/parser/parser.py +++ /dev/null @@ -1,109 +0,0 @@ -""" parser.py implements the parser base class, which defines the base API for - different parser type. -""" - -import os -from astropy import log - - -__all__ = ["ParserBase", "DirParser"] - - -class ParserBase: - """Base class for all types of parser. - - ParserBase class defines the high-level API for the subclass of parser. It - contains the wrapper methods that call the file type check functions and - all the user defined parse functions. - - The Parser class is a callable class which returns all the information - requested by the user. - - Parameters - ---------- - file_type : str - Type name. - extensions: str or list - All the acceptable file extensions. - - Atributes - --------- - parse_funs: tuple - The request information name and the method/callable functions to get - the information. The first element is the request information name (in - ADX this attribute will be used as the database column name.), and the - second value is the callable fucntions that parse the information. - One should organize the parse functions accordingly. - The default is an empty tuple. 
- """ - def __init__(self, file_type, extensions): - self.file_type = file_type - if isinstance(extensions, str): - extensions = [extensions,] - self.extensions = extensions - self.parse_funcs = () - - def __call__(self, filename, **kwargs): - """ High-level parse_info method. - - Paremeters - ---------- - filename : str - Full path to the file. - **kwargs : - Addtional input to the parse functons. - - Note - ---- - This if for the general purpose, however, it can be redefined in the - subclass. - """ - if not self.check_type(filename): - return None - else: - if parse_funcs == (): - raise ValueError("Parser needs parse functions.") - result = {} - for k, f in self.parse_funs: - result[k] = f(filename, **kwargs) - return result - - def check_type(self, filename): - """ User defined file type checker. - - Paremeter - --------- - filename : str - The file name - Return - ------ - If the file belongs to the defined type, return True, otherwrise - False. - """ - raise NotImplementedError - - -class DirParser(ParserBase): - """ A parser class for the directory. The directory parser is designed to - collect the over all information from a directory. It is able to read the old - directory logs, if provided, and provide updates on the directory - information. If the log is not provided, it will create a directory log - based on the current status. 
- """ - def __init__(self): - super().__init__('directory', 'directory') - - def set_up(self): - pass - - def check_type(self, filename): - return os.path.isdir(filename) - - def read_logs(self, input_log=None, log_format=None): - pass - - def cur_item_num(self, dir_name): - return len(os.path.listdir(dir_name)) - - def last_update(self, dir_name): - pass diff --git a/adx/tabio.py b/adx/tabio.py new file mode 100644 index 0000000..79940c8 --- /dev/null +++ b/adx/tabio.py @@ -0,0 +1,19 @@ +'''Handles table/db io''' + +from astropy import table as at + +class tabio: + def __init__(self, name='tabio'): + pass + + def Write(self, pt, payload): + # Astropy table io? + pass + + def Query(self): + # need to decide on the grammar + # TODO it be fun to implement this + pass + + def close(self): + pass diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..687776c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +os +subprocess +pymongo +json + + diff --git a/tests/mongodb.sh b/tests/mongodb.sh new file mode 100644 index 0000000..999e5cf --- /dev/null +++ b/tests/mongodb.sh @@ -0,0 +1,11 @@ +# some commands which would come in handy + +# imports db +mongoimport --db ? --collection ? --drop ? --file ? +mongorestore ? +# starts db +mongod --fork --logpath --dbpath ? --maxConns ? --port ? --bind_ip ? +# dumps db +mongodump -d ? -o ? 
+ + diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..09acb1e --- /dev/null +++ b/tests/test.py @@ -0,0 +1,17 @@ +from adx import Adx +from adx.parser import parser +from adx.crawler import crawler +from adx.logger import logger + +myCrawl = crawler(pdir, batchsize=100) + +log = logger('localhost:84001',writeDB='./save.db', numtables=1) + +profpar = parser('^\d{8}_\d{6}_[BJ]\d{4}[+-]\d{4}.prof$') +profpar.AddFloat('SN', getSN) +profpar.AddString('Source', getSource) +# +myparse = parser() +myparse.AddParserType(profpar) + +myAdx = Adx(crawler=myCrawl, logger=log, parser=myparse) diff --git a/tests/test.sh b/tests/test.sh new file mode 100644 index 0000000..52bbbe2 --- /dev/null +++ b/tests/test.sh @@ -0,0 +1,15 @@ +adx connect localhost:84001 --one-session +adx schema +adx query -par sn --cond gt 5 --cond lt 10 +adx query -par mjd --cond gt 58255 +adx query "$or : {$sn $lt 5}, {$mjd $gt 58255}" +# 58255 = 05/17/2018 +adx query -par sn --cond gt 3 --exec {} \; +# this is like find exec +adx refresh +# this re-crawls and updates the db +adx disconnect +### oneline +adx --connect localhost:84005 query -par mjd --cond gt 57852 --exec {} \; + +adx /data --db vlbi --port 84001 --daemon diff --git a/tests/test1.py b/tests/test1.py new file mode 100644 index 0000000..b9addfb --- /dev/null +++ b/tests/test1.py @@ -0,0 +1,18 @@ +from adx import Adx +from adx.parser import parser +from adx.crawler import crawler +from adx.logger import logger + +myCrawl = crawler(pdir, batchsize=100) +myCrawly = crawler(opdir, batchsize=100) + +log = logger('localhost:84001',writeDB='./save.db', numtables=1) + +profpar = parser('^\d{8}_\d{6}_[BJ]\d{4}[+-]\d{4}.prof$') +profpar.AddFloat('SN', getSN) +profpar.AddString('Source', getSource) +# +myparse = parser() +myparse.AddParserType(profpar) + +myAdx = Adx(crawler=[myCrawl,myCrawly], logger=log, parser=myparse) diff --git a/tests/test_cand_pt.py b/tests/test_cand_pt.py new file mode 100644 index 0000000..5ac478d 
--- /dev/null +++ b/tests/test_cand_pt.py @@ -0,0 +1,6 @@ +from adx.parser import ParserType +path = "/home/shining/study/MS/vLITE/mkerr/cands" +filpt = ParserType('cand') +rgrule = "^\d{8}_\d{6}_muos_ea\d{2}_.cand$" +filpt.AddFilenameRegexRule(rgrule) +filpt.AddString('filename', 30, lambda x : x) diff --git a/tests/test_crawlparse.py b/tests/test_crawlparse.py new file mode 100644 index 0000000..cb4a4bb --- /dev/null +++ b/tests/test_crawlparse.py @@ -0,0 +1,15 @@ +from adx.parser import Parser +from adx.parser import ParserType +from adx.adx import Adx # I don't like this way of importing +############################### +path = "/home/shining/study/MS/vLITE/mkerr/fil" +pars = Parser() +### +filpt = ParserType('fil') +rgrule = "^\d{8}_\d{6}_muos_ea\d{2}_kur.fil$" +filpt.AddFilenameRegexRule(rgrule) +filpt.AddString('filename', 30, lambda x : x) +### +pars.AddParserType(filpt) +############################### +myadx = Adx(path, pars) diff --git a/tests/test_fil_pt.py b/tests/test_fil_pt.py new file mode 100644 index 0000000..b572853 --- /dev/null +++ b/tests/test_fil_pt.py @@ -0,0 +1,7 @@ +from adx.parser import ParserType +path = "/home/shining/study/MS/vLITE/mkerr/fil" +filpt = ParserType('fil') +rgrule = "^\d{8}_\d{6}_muos_ea\d{2}_.fil$" +filpt.AddFilenameRegexRule(rgrule) +filpt.AddString('filename', 30, lambda x : x) + diff --git a/tests/test_kurfil_pt.py b/tests/test_kurfil_pt.py new file mode 100644 index 0000000..3f778e2 --- /dev/null +++ b/tests/test_kurfil_pt.py @@ -0,0 +1,6 @@ +from adx.parser import ParserType +path = "/home/shining/study/MS/vLITE/mkerr/fil" +filpt = ParserType('kfil') +rgrule = "^\d{8}_\d{6}_muos_ea\d{2}_kur.fil$" +filpt.AddFilenameRegexRule(rgrule) +filpt.AddString('filename', 30, lambda x : x) diff --git a/tests/toy.py b/tests/toy.py new file mode 100644 index 0000000..1fd9d06 --- /dev/null +++ b/tests/toy.py @@ -0,0 +1,23 @@ +# imports +import Parser, ParserType, Crawler, Logger +# definition +Parser myParser +Crawler myCrawler 
+Logger myLogger +############################# +import toy_my_parsers.py +''' +While thinking more about it, I realized that we don't have to explicitly call +AddParserType methods of myParser, we can use +globals() +method to iterate over all the objects in the global scope and marry them ourselves. + +This way the user neednot bother with ensuring the parser object have the same name. +''' +############################# +# let's make some friends +myCrawler.__RulesToCrawler( myParser.__RulesToCrawler() ) +myLogger.__ReceiveSchema( myParser.__SchemaToLogger() ) + +# enough talk, let's crawl +myCrawler.DoYourThing() diff --git a/tests/toy_my_parsers.py b/tests/toy_my_parsers.py new file mode 100644 index 0000000..7902a4a --- /dev/null +++ b/tests/toy_my_parsers.py @@ -0,0 +1,27 @@ +''' +So this is the only file which user will have to provide +''' +from somewhere import GiveDM, GiveSN, GiveMJD, GiveJName +from somewhereelse import MyFilReader +# defining my parsertype +ParserType prof +prof.AddExtensionRule('prof') +prof.Reader(MyProfReader) +prof.AddFloat('SN',GiveSN) +prof.AddFloat('DM',GiveDM) +prof.AddFloat('MJD',GiveMJD) +prof.AddString('PSR', 10, GiveJName) +# marrying parsertype with Parser +# read globals +myParser.AddParserType(prof) + +# defining another parsertype +ParserType fil +fil.AddExtenstionRule('fil') +fil.AddFilenameRule('kur') +fil.Reader(MyFilReader) +fil.AddFloat('smean', GiveSMean) +# marrying parsertype with Parser +# read globals +myParser.AddParserType(fil) +