diff --git a/.vscode/.ropeproject/config.py b/.vscode/.ropeproject/config.py
new file mode 100644
index 0000000..dee2d1a
--- /dev/null
+++ b/.vscode/.ropeproject/config.py
@@ -0,0 +1,114 @@
+# The default ``config.py``
+# flake8: noqa
+
+
+def set_prefs(prefs):
+    """This function is called before opening the project"""
+
+    # Specify which files and folders to ignore in the project.
+    # Changes to ignored resources are not added to the history and
+    # VCSs. Also they are not returned in `Project.get_files()`.
+    # Note that ``?`` and ``*`` match all characters but slashes.
+    # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc'
+    # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc'
+    # '.svn': matches 'pkg/.svn' and all of its children
+    # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o'
+    # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o'
+    prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject',
+                                  '.hg', '.svn', '_svn', '.git', '.tox']
+
+    # Specifies which files should be considered python files. It is
+    # useful when you have scripts inside your project. Only files
+    # ending with ``.py`` are considered to be python files by
+    # default.
+    # prefs['python_files'] = ['*.py']
+
+    # Custom source folders: By default rope searches the project
+    # for finding source folders (folders that should be searched
+    # for finding modules). You can add paths to that list. Note
+    # that rope guesses project source folders correctly most of the
+    # time; use this if you have any problems.
+    # The folders should be relative to project root and use '/' for
+    # separating folders regardless of the platform rope is running on.
+    # 'src/my_source_folder' for instance.
+    # prefs.add('source_folders', 'src')
+
+    # You can extend python path for looking up modules
+    # prefs.add('python_path', '~/python/')
+
+    # Should rope save object information or not.
+    prefs['save_objectdb'] = True
+    prefs['compress_objectdb'] = False
+
+    # If `True`, rope analyzes each module when it is being saved.
+    prefs['automatic_soa'] = True
+    # The depth of calls to follow in static object analysis
+    prefs['soa_followed_calls'] = 0
+
+    # If `False`, "dynamic object analysis" is turned off when running
+    # modules or unit tests. This makes them much faster.
+    prefs['perform_doa'] = True
+
+    # Rope can check the validity of its object DB when running.
+    prefs['validate_objectdb'] = True
+
+    # How many undos to hold?
+    prefs['max_history_items'] = 32
+
+    # Shows whether to save history across sessions.
+    prefs['save_history'] = True
+    prefs['compress_history'] = False
+
+    # Set the number of spaces used for indenting. According to
+    # :PEP:`8`, it is best to use 4 spaces. Since most of rope's
+    # unit-tests use 4 spaces it is more reliable, too.
+    prefs['indent_size'] = 4
+
+    # Builtin and c-extension modules that are allowed to be imported
+    # and inspected by rope.
+    prefs['extension_modules'] = []
+
+    # Add all standard c-extensions to extension_modules list.
+    prefs['import_dynload_stdmods'] = True
+
+    # If `True`, modules with syntax errors are considered to be empty.
+    # The default value is `False`; when `False`, syntax errors raise a
+    # `rope.base.exceptions.ModuleSyntaxError` exception.
+    prefs['ignore_syntax_errors'] = False
+
+    # If `True`, rope ignores unresolvable imports. Otherwise, they
+    # appear in the importing namespace.
+    prefs['ignore_bad_imports'] = False
+
+    # If `True`, rope will insert new module imports as
+    # `from <package> import <module>` by default.
+    prefs['prefer_module_from_imports'] = False
+
+    # If `True`, rope will transform a comma list of imports into
+    # multiple separate import statements when organizing
+    # imports.
+    prefs['split_imports'] = False
+
+    # If `True`, rope will remove all top-level import statements and
+    # reinsert them at the top of the module when making changes.
+    prefs['pull_imports_to_top'] = True
+
+    # If `True`, rope will sort imports alphabetically by module name instead
+    # of alphabetically by import statement, with from imports after normal
+    # imports.
+    prefs['sort_imports_alphabetically'] = False
+
+    # Location of the implementation of
+    # rope.base.oi.type_hinting.interfaces.ITypeHintingFactory. In the general
+    # case you don't have to change this value, unless you're a rope expert.
+    # Change this value to inject your own implementations of the interfaces
+    # listed in the module rope.base.oi.type_hinting.providers.interfaces.
+    # For example, you can add your own providers for Django Models, or disable
+    # type-hinting search in a class hierarchy, etc.
+    prefs['type_hinting_factory'] = (
+        'rope.base.oi.type_hinting.factory.default_type_hinting_factory')
+
+
+def project_opened(project):
+    """This function is called after opening the project"""
+    # Do whatever you like here!
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..adc9fab
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,8 @@
+{
+    "python.testing.pytestArgs": [
+        "."
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.nosetestsEnabled": false,
+    "python.testing.pytestEnabled": true
+}
\ No newline at end of file
diff --git a/LICENSE.md b/LICENSE.md
index cd4ca3a..9b5660b 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -18,3 +18,5 @@ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Further revisions for the Python 3 port, 2020, Benjamin Schmidt
diff --git a/README.md b/README.md
index b573b02..4fa14b5 100755
--- a/README.md
+++ b/README.md
@@ -238,6 +238,24 @@
 Once this works, you can use various libraries to query the endpoint, or
 create an HTML page that builds off the endpoint. See the (currently
 underdeveloped) Bookworm-Vega repository for some examples.
 
+## Pre-tokenized data
+
+If your data has already been tokenized, it can be ingested by pointing the
+build at feature-count files instead of 'input.txt' or 'input.txt.gz'.
+
+```
+bookworm --feature-counts unigrams.txt --feature-counts bigrams.txt build all
+```
+
+The format for `unigrams.txt` is a little wonky. It consists of one row per
+document. The first element is the document identifier, followed by a tab. The
+second element is a block of CSV data that uses the formfeed character (`\f`)
+instead of a newline to separate records.
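+
+As a rough illustration (a minimal sketch, not something the bookworm CLI
+requires), one way to produce a row of `unigrams.txt` from Python is shown
+below. The `doc_id` and `counts` names are invented for the example, and it
+assumes each record is a `word,count` pair joined by `\f`, as described above.
+
+```
+# Hypothetical sketch: append one pre-tokenized document to unigrams.txt.
+from collections import Counter
+
+doc_id = "doc_001"  # document identifier (made up for the example)
+counts = Counter("hello world hello".split())
+
+# Join "word,count" records with the formfeed character rather than a newline.
+payload = "\f".join(f"{word},{n}" for word, n in counts.items())
+
+with open("unigrams.txt", "a", encoding="utf-8") as f:
+    f.write(f"{doc_id}\t{payload}\n")  # one tab-separated line per document
+```
+
+Schematically, each line has the shape: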
+ +``` +id\t{word,count csv} + +``` ## Production servers diff --git a/bookwormDB/CreateDatabase.py b/bookwormDB/CreateDatabase.py deleted file mode 100755 index f2ee016..0000000 --- a/bookwormDB/CreateDatabase.py +++ /dev/null @@ -1,613 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import MySQLdb -import re -import json -import os -from .variableSet import variableSet -from .variableSet import splitMySQLcode -from bookwormDB.configuration import Configfile -from configparser import NoOptionError -import logging -import warnings -from .sqliteKV import KV - -#if logging.getLogger().isEnabledFor(logging.DEBUG): - # Catch MYSQL warnings as errors if logging is set to debug. -# warnings.filterwarnings('error', category=MySQLdb.Warning) # For testing - -warnings.filterwarnings('ignore', 'Table .* already exists') -warnings.filterwarnings("ignore", ".*Can't create database.*; database exists.*") -warnings.filterwarnings("ignore", ".*Unknown table.*") -warnings.filterwarnings("ignore", "Table 'mysql.table_stats' doesn't exist") -warnings.filterwarnings("ignore", "Data truncated for column .*") -warnings.filterwarnings("ignore", "Incorrect integer value.*") - -class DB(object): - def __init__(self, dbname = None): - if dbname == None: - self.dbname = config.get("client","database") - else: - self.dbname = dbname - if not re.match("^[A-Za-z0-9_]+$", self.dbname): - raise NameError("Database names must not include any spaces or special characters") - self.conn = None - - def connect(self, setengine=True): - #These scripts run as the Bookworm _Administrator_ on this machine; defined by the location of this my.cnf file. - conf = Configfile("admin") - try: - host = conf.config.get("mysqld", "host") - except NoOptionError: - host = "localhost" - connect_args = { - "user": conf.config.get("client", "user"), - "passwd": conf.config.get("client", "password"), - "host": host, - "use_unicode": 'True', - "charset": 'utf8', - "db": '', - "local_infile": 1} - try: - logging.info(connect_args) - self.conn = MySQLdb.connect(**connect_args) - except MySQLdb.OperationalError: - # Sometimes mysql wants to connect over this rather than a socket: - # falling back to it for backward-compatibility. - logging.debug("Connection failed: attempting fallback over a different port") - if connect_args["host"] == "localhost": - connect_args["host"] = "127.0.0.1" - self.conn = MySQLdb.connect(**connect_args) - else: - raise - - cursor = self.conn.cursor() - cursor.execute("CREATE DATABASE IF NOT EXISTS %s default character set utf8" % self.dbname) - # Don't use native query attribute here to avoid infinite loops - cursor.execute("SET NAMES 'utf8'") - cursor.execute("SET CHARACTER SET 'utf8'") - if setengine: - try: - cursor.execute("SET default_storage_engine=MYISAM") - except: - logging.error("Forcing default engine failed. On some versions of Mysql,\ - you may need to add \"default-storage-engine=MYISAM\" manually\ - to the [mysqld] user in /etc/my.cnf. Trying again to connect...") - self.connect(setengine=False) - logging.debug("Connecting to %s" % self.dbname) - cursor.execute("USE %s" % self.dbname) - - def query(self, sql, params = None, many_params=None): - """ - If a connection times out, reboot - the connection and starts up nicely again. - - many_params: If included, assume that executemany() is expected, with the sequence of parameter - provided. 
- """ - logging.debug(" -- Preparing to execute SQL code -- " + sql) - logging.debug(" -- with params {}".format(params)) - - try: - cursor = self.conn.cursor() - if many_params is not None: - cursor.executemany(sql, many_params) - else: - - cursor.execute(sql) - except: - try: - self.connect() - cursor = self.conn.cursor() - if many_params is not None: - cursor.executemany(sql, many_params) - else: - if params is None: - cursor.execute(sql) - else: - cursor.execute(sql, params) - except: - logging.error("Query failed: \n" + sql + "\n") - raise - - return cursor - -class BookwormSQLDatabase(object): - - """ - This class gives interactions methods to a MySQL database storing Bookworm - data. Although the primary methods are about loading data already created - into the SQL database, it has a few other operations - that write out text files needed by the API and the web front end: - I take it as logical to do those here, since that how - it fits chronologically in the bookworm-creation sequence. - """ - - def __init__(self, dbname=None, - variableFile=".bookworm/metadata/jsoncatalog_derived.txt"): - """ - You can initialize it with a database name; - otherwise it defaults to finding a - Bookworm configuration file. - """ - self.config_manager = Configfile("admin") - config = self.config_manager.config - - self.dbname = dbname - - self.conn = None - - if self.dbname is not None: - # Sometimes this may be called just to access the - # variables elements. - self.db = DB(dbname=self.dbname) - else: - self.db = None - - if variableFile is not None: - try: - self.setVariables(originFile=variableFile) - except FileNotFoundError: - pass - def grantPrivileges(self): - """ - Grants select-only privileges to a non-admin mysql user for the API to - query with without risking exposing write access to the Internet. - - The username for these privileges is usually just 'bookworm' without a password, - but if you place a file at '/etc/bookworm.cnf', it will be read from there. - """ - - globalfile = Configfile("read_only") - - username=globalfile.config.get("client","user") - password=globalfile.config.get("client","password") - try: - self.db.query("GRANT SELECT ON %s.* TO '%s'@'localhost' IDENTIFIED BY '%s'" % (self.dbname,username,password)) - except MySQLdb._exceptions.OperationalError: - self.db.query("CREATE USER '%s'@'localhost' IDENTIFIED BY '%s'" % (username,password)) - self.db.query("GRANT SELECT ON %s.* TO '%s'@'localhost' IDENTIFIED BY '%s'" % (self.dbname,username,password)) - - def setVariables(self, originFile, anchorField="bookid", - jsonDefinition=".bookworm/metadata/field_descriptions_derived.json"): - self.variableSet = variableSet(originFile=originFile, anchorField=anchorField, jsonDefinition=jsonDefinition,db=self.db) - - def importNewFile(self,originFile,anchorField,jsonDefinition): - """ - Add additional metadata from a source collection of json-formatted rows. - originFile is the filename of the new metadata, in the same input format - as the original jsoncatalog.txt - anchorField is the field in the existing dataset it should be anchored onto; - jsonDefinition is a filename pointing to a file - of the format of field_descriptions.json describing the new data to ingest. - If it is of type None, then one will be guessed at. 
- """ - self.setVariables(originFile,anchorField=anchorField,jsonDefinition=jsonDefinition) - self.variableSet.writeMetadata() - self.variableSet.loadMetadata() - self.variableSet.updateMasterVariableTable() - for variable in self.variableSet.variables: - variable.clear_associated_memory_tables() - #self.reloadMemoryTables() - - def create_database(self): - dbname = self.dbname - dbuser = self.dbuser - dbpassword = self.dbpassword - - db = self.db - - #This must be run as a MySQL user with create_table privileges - try: - db.query("CREATE DATABASE " + dbname) - except: - logging.info("Database %s already exists: that might be intentional, so not dying" % dbname) - - "Setting up permissions for web user..." - db.query("GRANT SELECT ON " + dbname + ".*" + " TO '" + dbuser + "'@'localhost' IDENTIFIED BY '" + dbpassword + "'") - db.query("GRANT SELECT ON {}.* TO 'bookworm'@'localhost'".format(dbname)) - db.query("FLUSH PRIVILEGES") - #a field to store stuff we might need later. - db.query("CREATE TABLE IF NOT EXISTS bookworm_information (entry VARCHAR(255), PRIMARY KEY (entry), value VARCHAR(50000))") - - def load_word_list(self): - db = self.db - logging.info("Making a SQL table to hold the words") - db.query("""DROP TABLE IF EXISTS words""") - db.query("""CREATE TABLE IF NOT EXISTS words ( - wordid MEDIUMINT UNSIGNED NOT NULL, - word VARCHAR(255), INDEX (word), - count BIGINT UNSIGNED, - casesens VARBINARY(255), - stem VARCHAR(255) - );""") - - db.query("ALTER TABLE words DISABLE KEYS") - logging.info("loading data using LOAD DATA LOCAL INFILE") - db.query("""LOAD DATA LOCAL INFILE '.bookworm/texts/wordlist/wordlist.txt' - INTO TABLE words - CHARACTER SET binary - (wordid,word,count) """) - logging.info("creating indexes on words table") - db.query("ALTER TABLE words ENABLE KEYS") - db.query("UPDATE words SET casesens=word") - - def load_book_list(self): - """ - Slated for deletion. - - Loads in the tables that have already been created by a previous - call to `Bookworm.variableSet.writeMetadata()` - """ - self.variableSet.loadMetadata() - - def create_unigram_book_counts(self, newtable=True, ingest=True, index=True, reverse_index=True, table_count=1): - import time - t0 = time.time() - - db = self.db - ngramname = "unigrams" - tablenameroot = "master_bookcounts" - # If you are splitting the input into multiple tables - # to be joined as a merge table, come up with multiple - # table names and we'll cycle through. - if table_count == 1: - tablenames = [tablenameroot] - elif table_count > 1: - tablenames = ["%s_p%d" % (tablenameroot, i) for i in range(1, table_count+1)] - else: - logging.error("You need a positive integer for table_count") - raise - - grampath = ".bookworm/texts/encoded/%s" % ngramname - tmpdir = "%s/tmp" % grampath - - if (len(grampath) == 0) or (grampath == "/"): - logging.error("Woah! 
Don't set the ngram path to your system root!") - raise - - if newtable: - if os.path.exists(tmpdir): - import shutil - shutil.rmtree(tmpdir) - - logging.info("Dropping older %s table, if it exists" % ngramname) - for tablename in tablenames: - db.query("DROP TABLE IF EXISTS " + tablename) - - logging.info("Making a SQL table to hold the %s" % ngramname) - reverse_index_sql = "INDEX(bookid,wordid,count), " if reverse_index else "" - for tablename in tablenames: - db.query("CREATE TABLE IF NOT EXISTS " + tablename + " (" - "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql + - "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), " - "count MEDIUMINT UNSIGNED NOT NULL);") - - if ingest: - for tablename in tablenames: - db.query("ALTER TABLE " + tablename + " DISABLE KEYS") - db.query("set NAMES utf8;") - db.query("set CHARACTER SET utf8;") - logging.info("loading data using LOAD DATA LOCAL INFILE") - - files = os.listdir(grampath) - for i, filename in enumerate(files): - if filename.endswith('.txt'): - # With each input file, cycle through each table in tablenames - tablename = tablenames[i % len(tablenames)] - logging.debug("Importing txt file, %s (%d/%d)" % (filename, i, len(files))) - try: - db.query("LOAD DATA LOCAL INFILE '" + grampath + "/" + filename + "' INTO TABLE " + tablename +" CHARACTER SET utf8 (bookid,wordid,count);") - except KeyboardInterrupt: - raise - except: - logging.debug("Falling back on insert without LOCAL DATA INFILE. Slower.") - try: - import pandas as pd - df = pd.read_csv(grampath + "/" + filename, sep='\t', header=None) - to_insert = df.apply(tuple, axis=1).tolist() - db.query( - "INSERT INTO " + tablename + " (bookid,wordid,count) " - "VALUES (%s, %s, %s);""", - many_params=to_insert - ) - except KeyboardInterrupt: - raise - except: - logging.exception("Error inserting %s from %s" % (ngramname, filename)) - continue - - elif filename.endswith('.h5'): - logging.info("Importing h5 file, %s (%d/%d)" % (filename, i, len(files))) - try: - # When encountering an .h5 file, this looks for ngram information - # in a /#{ngramnames} table (e.g. /unigrams) and writes it out to - # temporary TSV files. - # Dask is used here simply because it's a dead simple way to multithread - # the TSV writing and lower the overhead versus having a TSV already staged. - import csv - import pandas as pd - try: - import dask.dataframe as dd - except: - logging.exception("Ingesting h5 files requires dask") - try: - os.makedirs(tmpdir) - except OSError: - if not os.path.isdir(tmpdir): - raise - # Dask will use #{n_cores-1} threads when saving CSVs. - # Ingest and key reload times are identical to txt import, so the only - # additional overhead is reading the file (small effect) and writing the csv. - ddf = dd.read_hdf(grampath + "/" + filename, - ngramname, mode='r', chunksize=2000000) - ddf.reset_index().to_csv(tmpdir + '/tmp.*.tsv', - index=False, sep='\t', header=False, - quoting=csv.QUOTE_NONNUMERIC) - logging.info("CSV written from H5. Time passed: %.2f s" % (time.time() - t0)) - for j, tmpfile in enumerate(os.listdir(tmpdir)): - # With each input file, cycle through each table in tablenames - tablename = tablenames[j % len(tablenames)] - path = "%s/%s" % (tmpdir, tmpfile) - db.query("LOAD DATA LOCAL INFILE '" + path + "' " - "INTO TABLE " + tablename + " " - "CHARACTER SET utf8 (bookid,wordid,count);") - try: - os.remove(path) - except: - pass - logging.info("CSVs input. 
Time passed: %.2f s" % (time.time() - t0)) - except KeyboardInterrupt: - raise - except: - logging.exception("Error inserting %s from %s" % (ngramname, filename)) - continue - else: - continue - if index: - logging.info("Creating Unigram Indexes. Time passed: %.2f s" % (time.time() - t0)) - for tablename in tablenames: - db.query("ALTER TABLE " + tablename + " ENABLE KEYS") - - if table_count > 1: - logging.info("Creating a merge table for " + ",".join(tablenames)) - db.query("CREATE TABLE IF NOT EXISTS " + tablenameroot + " (" - "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql + - "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), " - "count MEDIUMINT UNSIGNED NOT NULL) " - "ENGINE=MERGE UNION=(" + ",".join(tablenames) + ") INSERT_METHOD=LAST;") - - logging.info("Unigram index created in: %.2f s" % ((time.time() - t0))) - - def create_bigram_book_counts(self): - db = self.db - logging.info("Making a SQL table to hold the bigram counts") - db.query("""DROP TABLE IF EXISTS master_bigrams""") - db.query("""CREATE TABLE master_bigrams ( - bookid MEDIUMINT UNSIGNED NOT NULL, - word1 MEDIUMINT UNSIGNED NOT NULL, INDEX (word1,word2,bookid,count), - word2 MEDIUMINT UNSIGNED NOT NULL, - count MEDIUMINT UNSIGNED NOT NULL);""") - db.query("ALTER TABLE master_bigrams DISABLE KEYS") - logging.info("loading data using LOAD DATA LOCAL INFILE") - for filename in os.listdir(".bookworm/texts/encoded/bigrams"): - db.query("LOAD DATA LOCAL INFILE '.bookworm/texts/encoded/bigrams/"+filename+"' INTO TABLE master_bigrams CHARACTER SET utf8 (bookid,word1,word2,count);") - - logging.info("Creating bigram indexes") - db.query("ALTER TABLE master_bigrams ENABLE KEYS") - - def loadVariableDescriptionsIntoDatabase(self): - """ - This adds a description of files to the master variable table: - also, crucially, it puts code specifying their fast creation there, - where it will be executed on startup for all eternity. - """ - logging.debug("Building masterVariableTable") - db = self.db - db.query("DROP TABLE IF EXISTS masterVariableTable") - m = db.query(""" - CREATE TABLE IF NOT EXISTS masterVariableTable - (dbname VARCHAR(255), PRIMARY KEY (dbname), - name VARCHAR(255), - type VARCHAR(255), - tablename VARCHAR(255), - anchor VARCHAR(255), - alias VARCHAR(255), - status VARCHAR(255), - description VARCHAR(5000) - ) ENGINE=MYISAM; - """) - tableTable = db.query(""" - CREATE TABLE IF NOT EXISTS masterTableTable - (tablename VARCHAR(255), PRIMARY KEY (tablename), - dependsOn VARCHAR(255), - memoryCode VARCHAR(20000)) ENGINE=MYISAM; - """) - self.addFilesToMasterVariableTable() - self.addWordsToMasterVariableTable() - self.variableSet.updateMasterVariableTable() - - def reloadMemoryTables(self, force=False, names = None): - - """ - Checks to see if memory tables need to be repopulated (by seeing if they are empty) - and then does so if necessary. - - If an array is passed to 'names', only the specified tables will be - loaded into memory; otherwise, all will. - """ - - q = "SELECT tablename,memoryCode FROM masterTableTable" - existingCreateCodes = self.db.query(q).fetchall() - - if names is not None: - existingCreateCodes = [e for e in existingCreateCodes if e[0] in names] - - for row in existingCreateCodes: - """ - For each table, it checks to see if the table is currently populated; if not, - it runs the stored code to repopulate the table. (It checks length because - memory tables are emptied on a restart). 
- """ - tablename = row[0] - try: - cursor = self.db.query("SELECT count(*) FROM %s" %(tablename)) - currentLength = cursor.fetchall()[0][0] - logging.debug("Current Length is %d" %currentLength) - except: - currentLength = 0 - if currentLength==0 or force: - for query in splitMySQLcode(row[1]): - self.db.query("SET optimizer_search_depth=0") - self.db.query(query) - - - def fastcat_creation_SQL(self, engine="MEMORY"): - """ - Generate SQL to create the fastcat (memory) and fastcat_ (on-disk) tables. - """ - - tbname = "fastcat" - if engine=="MYISAM": - tbname = "fastcat_" - - fastFieldsCreateList = [ - "bookid MEDIUMINT UNSIGNED NOT NULL, PRIMARY KEY (bookid)", - "nwords MEDIUMINT UNSIGNED NOT NULL" - ] - - fastFieldsCreateList += [variable.fastSQL() for variable in self.variableSet.uniques("fast")] - - create_command = """DROP TABLE IF EXISTS tmp;""" - create_command += "CREATE TABLE tmp ({}) ENGINE={};""".format( - ", ".join(fastFieldsCreateList), engine) - - if engine == "MYISAM": - fastFields = ["bookid","nwords"] + [variable.fastField for variable in self.variableSet.uniques("fast")] - load_command = "INSERT INTO tmp SELECT " - load_command += ",".join(fastFields) + " FROM catalog USE INDEX () " - # LEFT JOIN fixes a bug where fields were being dropped - load_command += " ".join(["LEFT JOIN %(field)s__id USING (%(field)s ) " % variable.__dict__ for variable in self.variableSet.uniques("categorical")]) + ";" - elif engine == "MEMORY": - load_command = "INSERT INTO tmp SELECT * FROM fastcat_;" - - cleanup_command = "DROP TABLE IF EXISTS {};".format(tbname) - cleanup_command += "RENAME TABLE tmp TO {};".format(tbname) - return create_command + load_command + cleanup_command; - - def create_fastcat_and_wordsheap_disk_tables(self): - for q in self.fastcat_creation_SQL("MYISAM").split(";"): - if q != "": - self.db.query(q) - for q in self.wordsheap_creation_SQL("MYISAM").split(";"): - if q != "": - self.db.query(q) - - def addFilesToMasterVariableTable(self): - #Also update the wordcounts for each text. 
- code = self.fastcat_creation_SQL("MEMORY") - self.db.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="fastcat";') - self.db.query("""INSERT IGNORE INTO masterTableTable VALUES - ('fastcat','fastcat','{}')""".format(code)) - - - def wordsheap_creation_SQL(self,engine="MEMORY",max_word_length=30,max_words = 1500000): - tbname = "wordsheap" - if engine=="MYISAM": - tbname = "wordsheap_" - wordCommand = "DROP TABLE IF EXISTS tmp;" - wordCommand += "CREATE TABLE tmp (wordid MEDIUMINT UNSIGNED NOT NULL, PRIMARY KEY (wordid), word VARCHAR(30), INDEX (word), casesens VARBINARY(30),UNIQUE INDEX(casesens), lowercase CHAR(30), INDEX (lowercase) ) ENGINE={};".format(engine) - if engine=="MYISAM": - wordCommand += "INSERT IGNORE INTO tmp SELECT wordid as wordid,word,casesens,LOWER(word) FROM words WHERE CHAR_LENGTH(word) <= {} AND wordid <= {} ORDER BY wordid;".format(max_word_length,max_words) - else: - wordCommand += "INSERT IGNORE INTO tmp SELECT * FROM wordsheap_;" - wordCommand += "DROP TABLE IF EXISTS {};".format(tbname) - wordCommand += "RENAME TABLE tmp TO {};".format(tbname) - return wordCommand - - def addWordsToMasterVariableTable(self, max_word_length = 30, max_words = 1500000): - """ - - """ - wordCommand = self.wordsheap_creation_SQL("MEMORY",max_word_length,max_words) - query = "INSERT IGNORE INTO masterTableTable " - query += "VALUES ('wordsheap','wordsheap','{}'); ".format(wordCommand) - logging.info("Creating wordsheap") - self.db.query(query) - - def jsonify_data(self): - variables = self.variableSet.variables - dbname = self.dbname - #This creates a JSON file compliant with the Bookworm web site. - #Deprecated. - output = dict() - output['settings'] = { - "dbname": self.dbname, - "itemName":" text", - "sourceName": self.dbname, - "sourceURL": self.dbname - } - ui_components = [ - { - "type":"text", - "dbfield":"word", - "name":"Word(s)" - } - ] - for variable in variables: - newdict = variable.jsonDict() - if newdict: #(It can be empty, in which case we don't want it for the json) - ui_components.append(newdict) - try: - mytime = [variable.field for variable in variables if variable.datatype=='time'][0] - output['default_search'] = [ - { - "search_limits": [{"word":["test"]}], - "time_measure": mytime, - "words_collation": "Case_Sensitive", - "counttype": "Occurrences_per_Million_Words", - "smoothingSpan": 0 - } - ] - except: - logging.warning("No default search created because of insufficient data.") - output['ui_components'] = ui_components - - with open('.bookworm/%s.json' % dbname, 'w') as outfile: - outfile.write(json.dumps(output)) - - def create_API_settings(self): - db = self.db - try: - db.query("DROP TABLE IF EXISTS API_settings") - db.query("CREATE TABLE API_settings (settings VARCHAR(8192));") - except: - pass - api_info = { - "HOST": "10.102.15.45", - "database": self.dbname, - "read_default_file": "/etc/mysql/my.cnf", - } - addCode = json.dumps(api_info) - logging.info(addCode) - db.query("INSERT INTO API_settings VALUES ('%s');" % addCode) - - def update_Porter_stemming(self): #We use stems occasionally. - """ - Still not executed. - """ - logging.info("Updating stems from Porter algorithm...") - from nltk import PorterStemmer - db = self.db - - stemmer = PorterStemmer() - cursor = db.query("""SELECT word FROM words""") - words = cursor.fetchall() - for local in words: - word = ''.join(local) # Could probably take the first element of the tuple as well? 
-            # Apostrophes have the save stem as the word, if they're included
-            word = word.replace("'s","")
-            if re.match("^[A-Za-z]+$",word):
-                query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + """' WHERE word='""" + ''.join(local) + """';"""
-                z = cursor.execute(query)
diff --git a/bookwormDB/DuckSchema.py b/bookwormDB/DuckSchema.py
new file mode 100644
index 0000000..d1ae79d
--- /dev/null
+++ b/bookwormDB/DuckSchema.py
@@ -0,0 +1,136 @@
+import pyarrow as pa
+from base64 import b64decode
+import logging
+import pandas as pd
+logger = logging.getLogger("bookworm")
+
+class DuckSchema(object):
+    """
+    This class stores information about the database setup that is used to
+    optimize query creation and to tell queries which tables to include.
+    It is broken out into its own class because it might usefully be wrapped
+    around some of the backend features, and because it shouldn't be rebuilt
+    multiple times for a single query (which would spawn two instances of
+    itself), as was happening before.
+    """
+
+    def __init__(self, db):
+        # XXXX
+        self.db = db
+        self._records = None
+        # hash of what table each variable is in
+        self.tableToLookIn = {
+            '_ncid': 'fastcat',
+            '@id': "slowcat",
+            'wordid': "wordsheap",
+            'nwords': 'fastcat'}
+
+        # hash of what the root variable for each search term is (eg,
+        # 'author_birth' might be crosswalked to 'authorid' in the
+        # main catalog.)
+        self.anchorFields = {
+            '_ncid': '_ncid',
+            '@id': "slowcat",
+            'wordid': "wordid",
+            'word': "wordid",
+            'nwords': '_ncid'
+        }
+
+        # aliases: a hash showing internal identification codes that
+        # dramatically speed up query time, but which shouldn't be
+        # exposed. So you can run a search for "state," say, and the
+        # database will group on a 50-element integer code instead of
+        # a VARCHAR that has to be long enough to support
+        # "Massachusetts" and "North Carolina." A couple are
+        # hard-coded in, but most are derived by looking for fields
+        # that end in the suffix "__id" later.
+
+        # The aliases dict starts with a dummy alias for fully grouped queries.
+        self.aliases = {}
+
+        tables = db.execute("SELECT name, schema FROM arrow_schemas WHERE type='table'").fetchall()
+        schema = dict(tables)
+
+        # Walk each table's Arrow schema: the first column is the anchor
+        # (join key); every later column is looked up through that anchor.
+        current_anchor = None
+        self.fields = []
+        for tablename, tab in schema.items():
+            sch = pa.ipc.read_schema(pa.py_buffer(b64decode(tab)))
+            if tablename in ["catalog"]:
+                continue
+            for i, field in enumerate(sch):
+                self.fields.append(field)
+                if i == 0:
+                    current_anchor = field.name
+                else:
+                    self.tableToLookIn[field.name] = tablename
+                    self.anchorFields[field.name] = current_anchor
+                    if current_anchor.endswith("__id"):
+                        self.aliases[field.name] = current_anchor
+
+        # A few columns are kept in the 'slowcat' view for historical reasons.
+        slowcols = set(db.execute("DESCRIBE TABLE slowcat").df()['Field'])
+        current_anchor = "_ncid"
+        for i, field in enumerate(slowcols):
+            if i > 0:
+                self.tableToLookIn[field] = "slowcat"
+                self.anchorFields[field] = "_ncid"
+
+    @property
+    def records(self):
+        """
+        Return a dict describing the schema's queryable fields.
+        """
+        if self._records is not None:
+            return self._records
+        fields = {}
+        for field in self.fields:
+            name = field.name
+            if name.endswith("__id"):
+                continue
+            elif name in { 'count', 'wordid', '_ncid' }:
+                continue
+            elif str(field.type) == 'old_string':
+                continue
+            else:
+                fields[name] = {'dbname': name, 'dtype': str(field.type)}
+                if field.metadata:
+                    for k, v in field.metadata.items():
+                        fields[name][k.decode('utf-8')] = v.decode('utf-8')
+
+        self._records = fields
+        return fields
+
+    def to_pandas(self):
+        return pd.DataFrame([*self.records.values()])
+
+    def tables_for_variable(self, variable, depth=0):
+        """
+        Returns the tables needed to look up a variable, back up to 'fastcat' or 'wordsheap'.
+        """
+        if variable == '_ncid' or variable == 'wordid' or (variable.startswith("word") and len(variable) == 5):
+            return []
+        try:
+            tabs = [
+                (depth, self.tableToLookIn[variable]),
+                *self.tables_for_variable(self.anchorFields[variable], depth - 1)
+            ]
+        except KeyError:
+            logger.error("Could not resolve variable %s", variable)
+            logger.error("Known anchors: %s", self.anchorFields)
+            logger.error("Known tables: %s", self.tableToLookIn)
+            raise
+        return tabs
+
+    def tables_for_variables(self, variables):
+        lookups = []
+        for variable in variables:
+            lookups = lookups + self.tables_for_variable(variable)
+        lookups.sort()
+        tables = []
+        for depth, tablename in lookups:
+            if tablename not in tables:
+                tables.append(tablename)
+        return tables
\ No newline at end of file
diff --git a/bookwormDB/MetaParser.py b/bookwormDB/MetaParser.py
deleted file mode 100644
index d3d20dc..0000000
--- a/bookwormDB/MetaParser.py
+++ /dev/null
@@ -1,269 +0,0 @@
-from __future__ import division
-from datetime import date
-import datetime
-import dateutil.parser
-import json
-import sys
-import os
-import logging
-from multiprocessing import Queue, Process
-from queue import Empty
-from .multiprocessingHelp import mp_stats, running_processes
-import time
-
-
-defaultDate = datetime.datetime(datetime.MINYEAR, 1, 1)
-
-def DaysSinceZero(dateobj):
-    #Zero isn't a date, which python knows but MySQL and javascript don't.
- return (dateobj - date(1,1,1)).days + 366 - -def ParseFieldDescs(write = False): - f = open('field_descriptions.json', 'r') - try: - fields = json.loads(f.read()) - except ValueError: - raise ValueError("Error parsing JSON: Check to make sure that your field_descriptions.json file is valid?") - - - if write: - derivedFile = open('.bookworm/metadata/field_descriptions_derived.json', 'w') - - output = [] - - fields_to_derive = [] - - for field in fields: - if field["datatype"] == "time": - if "derived" in field: - fields_to_derive.append(field) - else: - output.append(field) - else: - output.append(field) - - for field in fields_to_derive: - for derive in field["derived"]: - if "aggregate" in derive: - tmp = dict(datatype="time", type="integer", unique=True) - tmp["field"] = '_'.join([field["field"], derive["resolution"], - derive["aggregate"]]) - output.append(tmp) - else: - tmp = dict(datatype="time", type="integer", unique=True) - tmp["field"] = '_'.join([field["field"], derive["resolution"]]) - output.append(tmp) - if write: - derivedFile.write(json.dumps(output)) - derivedFile.close() - - return (fields_to_derive, fields) - -def parse_json_catalog(line_queue, processes, modulo): - fields_to_derive, fields = ParseFieldDescs(write = False) - - if os.path.exists("jsoncatalog.txt"): - mode = "json" - fin = open("jsoncatalog.txt") - - if os.path.exists("catalog.csv"): - mode = "csv" - import csv - fin = csv.DictReader("catalog.csv") - - for i, line in enumerate(fin): - if i % processes != modulo: - continue - - for char in ['\t', '\n']: - line = line.replace(char, '') - - if mode == "json": - try: - line = json.loads(line) - except: - logging.warn("Couldn't parse catalog line {}".format(line)) - continue - - for field in fields: - # Smash together misidentified lists - try: - if field['unique'] and isinstance(line[field["field"]],list): - line[field["field"]] = "--".join(line[field["field"]]) - except KeyError: - pass - - for field in fields_to_derive: - - """ - Using fields_to_derive as a shorthand for dates--this may break - if we get more ambitious about derived fields, - but this whole metadata-parsing code needs to be refactored anyway. - - Note: this code is inefficient--it parses the same date multiple times. - We should be parsing the date once and pulling - derived fields out of that one parsing. 
- """ - - try: - if line[field["field"]]=="": - # Use blankness as a proxy for unknown - continue - - time = dateutil.parser.parse(line[field["field"]],default = defaultDate) - intent = [time.year,time.month,time.day] - content = [str(item) for item in intent] - - pass - except: - """ - Fall back to parsing as strings - """ - try: - datem = line[field["field"]].split("T")[0] - content = datem.split('-') - intent = [int(item) for item in content] - except KeyError: - #It's OK not to have an entry for a time field - continue - except ValueError: - # Thrown if fields are empty on taking the int value: treat as junk - continue - except AttributeError: - """ - Happens if it's an integer, which is a forgiveable way - to enter a year: - """ - content = [str(line[field['field']])] - intent = [line[field['field']]] - else: - for derive in field["derived"]: - try: - if "aggregate" in derive: - if derive["resolution"] == 'day' and \ - derive["aggregate"] == "year": - k = "%s_day_year" % field["field"] - dt = date(intent[0], intent[1], intent[2]) - line[k] = dt.timetuple().tm_yday - elif derive["resolution"] == 'day' and \ - derive["aggregate"] == "month": - k = "%s_day_month" % field["field"] - line[k] = intent[2] - elif derive["resolution"] == 'day' and \ - derive["aggregate"] == "week": - k = "%s_day_month" % field["field"] - dt = date(intent[0], intent[1], intent[2]) - # Python and javascript handle weekdays differently: - # Like JS, we want to begin on Sunday with zero - line[k] = dt.weekday() + 1 - if (line[k]) == 7: - line[k] = 0 - elif derive["resolution"] == 'month' and \ - derive["aggregate"] == "year": - k = "%s_month_year" % field["field"] - dt = date(1,intent[1],1) - line[k] = dt.timetuple().tm_yday - elif derive["resolution"] == 'week' and \ - derive["aggregate"] == "year": - dt = date(intent[0], intent[1], intent[2]) - k = "%s_week_year" % field["field"] - line[k] = int(dt.timetuple().tm_yday/7)*7 - elif derive["resolution"] == 'hour' and \ - derive["aggregate"] == "day": - k = "%s_hour_day" % field["field"] - line[k] = time.hour - elif derive["resolution"] == 'minute' and \ - derive["aggregate"] == "day": - k = "%s_hour_day" % field["field"] - line[k] = time.hour*60 + time.minute - else: - logging.warning('Problem with aggregate resolution.') - continue - else: - if derive["resolution"] == 'year': - line["%s_year" % field["field"]] = intent[0] - elif derive["resolution"] == 'month': - try: - k = "%s_month" % field["field"] - dt = date(intent[0], intent[1], 1) - line[k] = DaysSinceZero(dt) - except: - logging.warning("Problem with date fields\n") - pass - elif derive['resolution'] == 'week': - k = "%s_week" % field['field'] - dt = date(intent[0], intent[1], intent[2]) - inttime = DaysSinceZero(dt) - time = int(inttime/7)*7 - #Not starting on Sunday or anything funky like that. Actually, I don't know what we're starting on. Adding an integer here would fix that. - line[k] = time - elif derive['resolution'] == 'day': - k = "%s_day" % field['field'] - dt = date(intent[0], intent[1], intent[2]) - inttime = DaysSinceZero(dt) - line[k] = inttime - else: - logging.warning('Resolution %s currently not supported.' % (derive['resolution'])) - continue - except ValueError: - # One of out a million Times articles threw this with - # a year of like 111,203. It's not clear how best to - # handle this. - logging.warning("ERROR: %s " % line[field["field"]] + - "did not convert to proper date. 
Moving on...") - # raise - pass - except Exception as e: - logging.warning('*'*50) - logging.warning('ERROR: %s\nINFO: %s\n' % (str(e), e.__doc__)) - logging.warning('*'*50) - line.pop(field["field"]) - try: - el = json.dumps(line) - line_queue.put((line["filename"], el)) - except KeyError: - logging.warning("No filename key in {}".format(line)) - except: - logging.warning("Error on {}".format(line)) - raise - logging.debug("Metadata thread done after {} lines".format(i)) - - -def parse_catalog_multicore(): - from .sqliteKV import KV - cpus, _ = mp_stats() - encoded_queue = Queue(10000) - workers = [] - - for i in range(cpus): - p = Process(target = parse_json_catalog, args = (encoded_queue, cpus, i)) - p.start() - workers.append(p) - output = open(".bookworm/metadata/jsoncatalog_derived.txt", "w") - - bookids = KV(".bookworm/metadata/textids.sqlite") - import sqlite3 - - while True: - try: - filename, n = encoded_queue.get_nowait() - output.write(n + "\n") - ids = set() - try: - bookids.register(filename) - except sqlite3.IntegrityError: - if filename in ids: - logging.warning("Duplicate key insertion {}".format(filename)) - ids.add(filename) - - except Empty: - if running_processes(workers): - # Give it a sec to fill back up to avoid this thread taking up - # a full processor. - time.sleep(0.01) - else: - # We're done! - break - - bookids.close() - output.close() diff --git a/bookwormDB/SQLAPI.py b/bookwormDB/SQLAPI.py deleted file mode 100644 index 33bdf70..0000000 --- a/bookwormDB/SQLAPI.py +++ /dev/null @@ -1,1173 +0,0 @@ -#!/usr/local/bin/python - - -from .variableSet import to_unicode -import json -import re -import copy -import MySQLdb -import hashlib -import logging -from .bwExceptions import BookwormException - -# If you have bookworms stored on a different host, you can create more lines -# like this. -# A different host and read_default_file will let you import things onto a -# different server. -general_prefs = dict() -general_prefs["default"] = {"fastcat": "fastcat", - "fastword": "wordsheap", - "fullcat": "catalog", - "fullword": "words", - "read_default_file": "/etc/mysql/my.cnf" -} - -class DbConnect(object): - # This is a read-only account - def __init__(self, prefs=general_prefs['default'], database=None, - host=None): - - self.dbname = database - - import bookwormDB.configuration - conf = bookwormDB.configuration.Configfile("read_only").config - - if database is None: - database = prefs['database'] - - connargs = { - "db": database, - "use_unicode": 'True', - "charset": 'utf8', - "user": conf.get("client", "user"), - "password": conf.get("client", "password") - } - - if host: - connargs['host'] = host - # For back-compatibility: - elif "HOST" in prefs: - connargs['host'] = prefs['HOST'] - else: - host = "localhost" - - try: - self.db = MySQLdb.connect(**connargs) - except: - try: - # Sometimes mysql wants to connect over this rather than a socket: - # falling back to it for backward-compatibility. - connargs["host"] = "127.0.0.1" - self.db = MySQLdb.connect(**connargs) - except: - raise - - self.cursor = self.db.cursor() - -def fail_if_nonword_characters_in_columns(input): - keys = all_keys(input) - for key in keys: - if re.search(r"[^A-Za-z_$*0-9]", key): - logging.error("{} has nonword character".format(key)) - raise - - -def all_keys(input): - """ - Recursive function. Get every keyname in every descendant of a dictionary. - Iterates down on list and dict structures to search for more dicts with - keys. 
- """ - values = [] - if isinstance(input, dict): - values = list(input.keys()) - for key in list(input.keys()): - values = values + all_keys(input[key]) - if isinstance(input, list): - for value in input: - valleys = all_keys(value) - for val in valleys: - values.append(val) - return values - -# The basic object here is a 'userquery:' it takes dictionary as input, -# as defined in the API, and returns a value -# via the 'execute' function whose behavior -# depends on the mode that is passed to it. -# Given the dictionary, it can return a number of objects. -# The "Search_limits" array in the passed dictionary determines how many -# elements it returns; this lets multiple queries be bundled together. -# Most functions describe a subquery that might be combined into one big query -# in various ways. - -class userquery(object): - """ - The base class for a bookworm search. - """ - def __init__(self, outside_dictionary = {}, db = None, databaseScheme = None): - # Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it. - fail_if_nonword_characters_in_columns(outside_dictionary) - try: - self.prefs = general_prefs[outside_dictionary['database']] - except KeyError: - # If it's not in the option, use some default preferences and search on localhost. This will work in most cases here on out. - self.prefs = general_prefs['default'] - self.prefs['database'] = outside_dictionary['database'] - self.outside_dictionary = outside_dictionary - # self.prefs = general_prefs[outside_dictionary.setdefault('database', 'presidio')] - self.db = db - if db is None: - self.db = DbConnect(self.prefs) - self.databaseScheme = databaseScheme - if databaseScheme is None: - self.databaseScheme = databaseSchema(self.db) - - self.cursor = self.db.cursor - self.wordsheap = self.fallback_table(self.prefs['fastword']) - - self.words = self.prefs['fullword'] - """ - I'm now allowing 'search_limits' to either be a dictionary or an array of dictionaries: - this makes the syntax cleaner on most queries, - while still allowing some long ones from the Bookworm website. - """ - try: - if isinstance(outside_dictionary['search_limits'], list): - outside_dictionary['search_limits'] = outside_dictionary['search_limits'][0] - except: - outside_dictionary['search_limits'] = dict() - # outside_dictionary = self.limitCategoricalQueries(outside_dictionary) - self.defaults(outside_dictionary) # Take some defaults - self.derive_variables() # Derive some useful variables that the query will use. - - def defaults(self, outside_dictionary): - # these are default values;these are the only values that can be set in the query - # search_limits is an array of dictionaries; - # each one contains a set of limits that are mutually independent - # The other limitations are universal for all the search limits being set. 
- - # Set up a dictionary for the denominator of any fraction if it doesn't already exist: - self.search_limits = outside_dictionary.setdefault('search_limits', [{"word":["polka dot"]}]) - self.words_collation = outside_dictionary.setdefault('words_collation', "Case_Insensitive") - - lookups = {"Case_Insensitive":'word', 'lowercase':'lowercase', 'casesens':'casesens', "case_insensitive":"word", "Case_Sensitive":"casesens", "All_Words_with_Same_Stem":"stem", 'stem':'stem'} - self.word_field = lookups[self.words_collation] - - self.time_limits = outside_dictionary.setdefault('time_limits', [0, 10000000]) - self.time_measure = outside_dictionary.setdefault('time_measure', 'year') - - self.groups = set() - self.outerGroups = [] # [] # Only used on the final join; directionality matters, unlike for the other ones. - self.finalMergeTables=set() - try: - groups = outside_dictionary['groups'] - except: - groups = [outside_dictionary['time_measure']] - - if groups == [] or groups == ["unigram"]: - # Set an arbitrary column name that will always be true if nothing else is set. - groups.insert(0, "1 as In_Library") - - if (len(groups) > 1): - pass - # self.groups = credentialCheckandClean(self.groups) - # Define some sort of limitations here, if not done in dbbindings.py - - for group in groups: - - # There's a special set of rules for how to handle unigram and bigrams - multigramSearch = re.match("(unigram|bigram|trigram)(\d)?", group) - - if multigramSearch: - if group == "unigram": - gramPos = "1" - gramType = "unigram" - - else: - gramType = multigramSearch.groups()[0] - try: - gramPos = multigramSearch.groups()[1] - except: - print("currently you must specify which bigram element you want (eg, 'bigram1')") - raise - - lookupTableName = "%sLookup%s" %(gramType, gramPos) - self.outerGroups.append("%s.%s as %s" %(lookupTableName, self.word_field, group)) - self.finalMergeTables.add(" JOIN %s as %s ON %s.wordid=w%s" %(self.wordsheap, lookupTableName, lookupTableName, gramPos)) - self.groups.add("words%s.wordid as w%s" %(gramPos, gramPos)) - - else: - self.outerGroups.append(group) - try: - if self.databaseScheme.aliases[group] != group: - # Search on the ID field, not the basic field. - # debug(self.databaseScheme.aliases.keys()) - self.groups.add(self.databaseScheme.aliases[group]) - table = self.databaseScheme.tableToLookIn[group] - - joinfield = self.databaseScheme.aliases[group] - self.finalMergeTables.add(" JOIN " + table + " USING (" + joinfield + ") ") - else: - self.groups.add(group) - except KeyError: - self.groups.add(group) - - """ - There are the selections which can include table refs, and the groupings, which may not: - and the final suffix to enable fast lookup - """ - - self.selections = ",".join(self.groups) - self.groupings = ",".join([re.sub(".* as", "", group) for group in self.groups]) - - self.joinSuffix = "" + " ".join(self.finalMergeTables) - - """ - Define the comparison set if a comparison is being done. - """ - # Deprecated--tagged for deletion - # self.determineOutsideDictionary() - - # This is a little tricky behavior here--hopefully it works in all cases. It drops out word groupings. - - self.counttype = outside_dictionary.setdefault('counttype', ["WordCount"]) - - if isinstance(self.counttype, (str, bytes)): - self.counttype = [self.counttype] - - # index is deprecated, but the old version uses it. - self.index = outside_dictionary.setdefault('index', 0) - """ - # Ordinarily, the input should be an an array of groups that will both select and group by. 
- # The joins may be screwed up by certain names that exist in multiple tables, so there's an option to do something like - # SELECT catalog.bookid as myid, because WHERE clauses on myid will work but GROUP BY clauses on catalog.bookid may not - # after a sufficiently large number of subqueries. - # This smoothing code really ought to go somewhere else, since it doesn't quite fit into the whole API mentality and is - # more about the webpage. It is only included here as a stopgap: NO FURTHER APPLICATIONS USING IT SHOULD BE BUILT. - """ - - self.smoothingType = outside_dictionary.setdefault('smoothingType', "triangle") - self.smoothingSpan = outside_dictionary.setdefault('smoothingSpan', 3) - self.method = outside_dictionary.setdefault('method', "Nothing") - - def determineOutsideDictionary(self): - """ - deprecated--tagged for deletion. - """ - self.compare_dictionary = copy.deepcopy(self.outside_dictionary) - if 'compare_limits' in list(self.outside_dictionary.keys()): - self.compare_dictionary['search_limits'] = self.outside_dictionary['compare_limits'] - del self.outside_dictionary['compare_limits'] - elif sum([bool(re.search(r'\*', string)) for string in list(self.outside_dictionary['search_limits'].keys())]) > 0: - # If any keys have stars at the end, drop them from the compare set - # This is often a _very_ helpful definition for succinct comparison queries of many types. - # The cost is that an asterisk doesn't allow you - - for key in list(self.outside_dictionary['search_limits'].keys()): - if re.search(r'\*', key): - # rename the main one to not have a star - self.outside_dictionary['search_limits'][re.sub(r'\*', '', key)] = self.outside_dictionary['search_limits'][key] - # drop it from the compare_limits and delete the version in the search_limits with a star - del self.outside_dictionary['search_limits'][key] - del self.compare_dictionary['search_limits'][key] - else: # if nothing specified, we compare the word to the corpus. - deleted = False - for key in list(self.outside_dictionary['search_limits'].keys()): - if re.search('words?\d', key) or re.search('gram$', key) or re.match(r'word', key): - del self.compare_dictionary['search_limits'][key] - deleted = True - if not deleted: - # If there are no words keys, just delete the first key of any type. - # Sort order can't be assumed, but this is a useful failure mechanism of last resort. Maybe. - try: - del self.compare_dictionary['search_limits'][list(self.outside_dictionary['search_limits'].keys())[0]] - except: - pass - """ - The grouping behavior here is not desirable, but I'm not quite sure how yet. - Aha--one way is that it accidentally drops out a bunch of options. I'm just disabling it: let's see what goes wrong now. - """ - try: - pass# self.compare_dictionary['groups'] = [group for group in self.compare_dictionary['groups'] if not re.match('word', group) and not re.match("[u]?[bn]igram", group)]# topicfix? and not re.match("topic", group)] - except: - self.compare_dictionary['groups'] = [self.compare_dictionary['time_measure']] - - def derive_variables(self): - # These are locally useful, and depend on the search limits put in. - self.limits = self.search_limits - # Treat empty constraints as nothing at all, not as full restrictions. 
- for key in list(self.limits.keys()): - if self.limits[key] == []: - del self.limits[key] - self.set_operations() - self.create_catalog_table() - self.make_catwhere() - self.make_wordwheres() - - def tablesNeededForQuery(self, fieldNames=[]): - db = self.db - neededTables = set() - tablenames = dict() - tableDepends = dict() - db.cursor.execute("SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);") - for row in db.cursor.fetchall(): - tablenames[row[0]] = row[2] - tableDepends[row[2]] = row[3] - - for fieldname in fieldNames: - parent = "" - try: - current = tablenames[fieldname] - neededTables.add(current) - n = 1 - while parent not in ['fastcat', 'wordsheap']: - parent = tableDepends[current] - neededTables.add(parent) - current = parent - n+=1 - if n > 100: - raise TypeError("Unable to handle this; seems like a recursion loop in the table definitions.") - # This will add 'fastcat' or 'wordsheap' exactly once per entry - except KeyError: - pass - - return neededTables - - def needed_columns(self): - """ - Given a query, what are the columns that the compiled search will need materialized? - - Important for joining appropriate tables to the search. - - Needs a recursive function so it will find keys deeply nested inside "$or" searches. - """ - cols = [] - def pull_keys(entry): - val = [] - if isinstance(entry,list) and not isinstance(entry,(str, bytes)): - for element in entry: - val += pull_keys(element) - elif isinstance(entry,dict): - for k,v in entry.items(): - if k[0] != "$": - val.append(k) - else: - val += pull_keys(v) - else: - return [] - return [re.sub(" .*","",key) for key in val] - - return pull_keys(self.limits) + [re.sub(" .*","",g) for g in self.groups] - - - def create_catalog_table(self): - self.catalog = self.prefs['fastcat'] # 'catalog' # Can be replaced with a more complicated query in the event of longer joins. - - """ - This should check query constraints against a list of tables, and join to them. - So if you query with a limit on LCSH, and LCSH is listed as being in a separate table, - it joins the table "LCSH" to catalog; and then that table has one column, ALSO - called "LCSH", which is matched against. This allows a bookid to be a member of multiple catalogs. - """ - - self.relevantTables = set() - - databaseScheme = self.databaseScheme - columns = [] - for columnInQuery in self.needed_columns(): - columns.append(columnInQuery) - try: - self.relevantTables.add(databaseScheme.tableToLookIn[columnInQuery]) - try: - self.relevantTables.add(databaseScheme.tableToLookIn[databaseScheme.anchorFields[columnInQuery]]) - try: - self.relevantTables.add(databaseScheme.tableToLookIn[databaseScheme.anchorFields[databaseScheme.anchorFields[columnInQuery]]]) - except KeyError: - pass - except KeyError: - pass - except KeyError: - pass - # Could raise as well--shouldn't be errors--but this helps back-compatability. - - try: - moreTables = self.tablesNeededForQuery(columns) - except MySQLdb.ProgrammingError: - # What happens on old-style Bookworm constructions. 
- moreTables = set() - self.relevantTables = list(self.relevantTables.union(moreTables)) - - - self.relevantTables = [self.fallback_table(t) for t in self.relevantTables] - - self.catalog = self.fallback_table("fastcat") - if self.catalog == "fastcat_": - self.prefs['fastcat'] = "fastcat_" - - for table in self.relevantTables: - if table!="fastcat" and table!="words" and table!="wordsheap" and table!="master_bookcounts" and table!="master_bigrams" and table != "fastcat_" and table != "wordsheap_": - self.catalog = self.catalog + """ NATURAL JOIN """ + table + " " - - def fallback_table(self,tabname): - """ - Fall back to the saved versions if the memory tables are unpopulated. - - Use a cache first to avoid unnecessary queries, though the overhead shouldn't be much. - """ - tab = tabname - if tab.endswith("_"): - return tab - if tab in ["words","master_bookcounts","master_bigrams","catalog"]: - return tab - - if not hasattr(self,"fallbacks_cache"): - self.fallbacks_cache = {} - - if tabname in self.fallbacks_cache: - return self.fallbacks_cache[tabname] - - q = "SELECT COUNT(*) FROM {}".format(tab) - try: - self.db.cursor.execute(q) - length = self.db.cursor.fetchall()[0][0] - if length==0: - tab += "_" - except MySQLdb.ProgrammingError: - tab += "_" - - self.fallbacks_cache[tabname] = tab - - return tab - - def make_catwhere(self): - # Where terms that don't include the words table join. Kept separate so that we can have subqueries only working on one half of the stack. - catlimits = dict() - for key in list(self.limits.keys()): - # !!Warning--none of these phrases can be used in a bookworm as a custom table names. - - if key not in ('word', 'word1', 'word2', 'hasword') and not re.search("words\d", key): - catlimits[key] = self.limits[key] - if len(list(catlimits.keys())) > 0: - self.catwhere = where_from_hash(catlimits) - else: - self.catwhere = "TRUE" - if 'hasword' in list(self.limits.keys()): - """ - Because derived tables don't carry indexes, we're just making the new tables - with indexes on the fly to be stored in a temporary database, "bookworm_scratch" - Each time a hasword query is performed, the results of that query are permanently cached; - they're stored as a table that can be used in the future. - - This will create problems if database contents are changed; there needs to be some mechanism for - clearing out the cache periodically. - """ - - if self.limits['hasword'] == []: - del self.limits['hasword'] - return - - # deepcopy lets us get a real copy of the dictionary - # that can be changed without affecting the old one. - mydict = copy.deepcopy(self.outside_dictionary) - # This may make it take longer than it should; we might want the list to - # just be every bookid with the given word rather than - # filtering by the limits as well. - # It's not obvious to me which will be faster. - mydict['search_limits'] = copy.deepcopy(self.limits) - if isinstance(mydict['search_limits']['hasword'], (str, bytes)): - # Make sure it's an array - mydict['search_limits']['hasword'] = [mydict['search_limits']['hasword']] - """ - # Ideally, this would shuffle into an order ensuring that the - rarest words were nested deepest. - # That would speed up query execution by ensuring there - wasn't some massive search for 'the' being - # done at the end. - - Instead, it just pops off the last element and sets up a - recursive nested join. for every element in the - array. 
- """ - mydict['search_limits']['word'] = [mydict['search_limits']['hasword'].pop()] - if len(mydict['search_limits']['hasword']) == 0: - del mydict['search_limits']['hasword'] - tempquery = userquery(mydict, databaseScheme=self.databaseScheme) - listofBookids = tempquery.bookid_query() - - # Unique identifier for the query that persists across the - # various subqueries. - queryID = hashlib.sha1(listofBookids).hexdigest()[:20] - - tmpcatalog = "bookworm_scratch.tmp" + re.sub("-", "", queryID) - - try: - self.cursor.execute("CREATE TABLE %s (bookid MEDIUMINT, PRIMARY KEY (bookid)) ENGINE=MYISAM;" %tmpcatalog) - self.cursor.execute("INSERT IGNORE INTO %s %s;" %(tmpcatalog, listofBookids)) - - except MySQLdb.OperationalError as e: - # Usually the error will be 1050, which is a good thing: it means we don't need to - # create the table. - # If it's not, something bad is happening. - if not re.search("1050.*already exists", str(e)): - raise - self.catalog += " NATURAL JOIN %s "%(tmpcatalog) - - def make_wordwheres(self): - self.wordswhere = " TRUE " - self.max_word_length = 0 - limits = [] - """ - "unigram" or "bigram" can be used as an alias for "word" in the search_limits field. - """ - - for gramterm in ['unigram', 'bigram']: - if gramterm in list(self.limits.keys()) and "word" not in list(self.limits.keys()): - self.limits['word'] = self.limits[gramterm] - del self.limits[gramterm] - - if 'word' in list(self.limits.keys()): - """ - This doesn't currently allow mixing of one and two word searches together in a logical way. - It might be possible to just join on both the tables in MySQL--I'm not completely sure what would happen. - But the philosophy has been to keep users from doing those searches as far as possible in any case. - """ - for phrase in self.limits['word']: - locallimits = dict() - array = phrase.split() - n = 0 - for word in array: - n += 1 - searchingFor = word - if self.word_field == "stem": - from nltk import PorterStemmer - searchingFor = PorterStemmer().stem_word(searchingFor) - if self.word_field == "case_insensitive" or self.word_field == "Case_Insensitive": - # That's a little joke. Get it? - searchingFor = searchingFor.lower() - selectString = "SELECT wordid FROM %s WHERE %s = %%s" % (self.wordsheap, self.word_field) - - logging.debug(selectString) - cursor = self.db.cursor - cursor.execute(selectString,(searchingFor,)) - for row in cursor.fetchall(): - wordid = row[0] - try: - locallimits['words'+str(n) + ".wordid"] += [wordid] - except KeyError: - locallimits['words'+str(n) + ".wordid"] = [wordid] - self.max_word_length = max(self.max_word_length, n) - - # Strings have already been escaped, so don't need to be escaped again. - if len(list(locallimits.keys())) > 0: - limits.append(where_from_hash(locallimits, comp = " = ", escapeStrings=False)) - # XXX for backward compatability - self.words_searched = phrase - # XXX end deprecated block - self.wordswhere = "(" + ' OR '.join(limits) + ")" - if limits == []: - # In the case that nothing has been found, tell it explicitly to search for - # a condition when nothing will be found. 
- self.wordswhere = "words1.wordid=-1" - - wordlimits = dict() - - limitlist = copy.deepcopy(list(self.limits.keys())) - - for key in limitlist: - if re.search("words\d", key): - wordlimits[key] = self.limits[key] - self.max_word_length = max(self.max_word_length, 2) - del self.limits[key] - - if len(list(wordlimits.keys())) > 0: - self.wordswhere = where_from_hash(wordlimits) - - return self.wordswhere - - def build_wordstables(self): - # Deduce the words tables we're joining against. The iterating on this can be made more general to get 3 or four grams in pretty easily. - # This relies on a determination already having been made about whether this is a unigram or bigram search; that's reflected in the self.selections - # variable. - - """ - We also now check for whether it needs the topic assignments: this could be generalized, with difficulty, for any other kind of plugin. - """ - - needsBigrams = (self.max_word_length == 2 or re.search("words2", self.selections)) - needsUnigrams = self.max_word_length == 1 or re.search("[^h][^a][^s]word", self.selections) - - if self.max_word_length > 2: - err = dict(code=400, message="Phrase is longer than what Bookworm supports") - raise BookwormException(err) - - needsTopics = bool(re.search("topic", self.selections)) or ("topic" in list(self.limits.keys())) - - if needsBigrams: - - self.maintable = 'master_bigrams' - - self.main = ''' - JOIN - master_bigrams as main - ON ('''+ self.prefs['fastcat'] +'''.bookid=main.bookid) - ''' - - self.wordstables = """ - JOIN %(wordsheap)s as words1 ON (main.word1 = words1.wordid) - JOIN %(wordsheap)s as words2 ON (main.word2 = words2.wordid) """ % self.__dict__ - - # I use a regex here to do a blanket search for any sort of word limitations. That has some messy sideffects (make sure the 'hasword' - # key has already been eliminated, for example!) but generally works. - - elif needsTopics and needsUnigrams: - self.maintable = 'master_topicWords' - self.main = ''' - NATURAL JOIN - master_topicWords as main - ''' - self.wordstables = """ - JOIN ( %(wordsheap)s as words1) ON (main.wordid = words1.wordid) - """ % self.__dict__ - - elif needsUnigrams: - self.maintable = 'master_bookcounts' - self.main = ''' - NATURAL JOIN - master_bookcounts as main - ''' - - self.wordstables = """ - JOIN ( %(wordsheap)s as words1) ON (main.wordid = words1.wordid) - """ % self.__dict__ - - elif needsTopics: - self.maintable = 'master_topicCounts' - self.main = ''' - NATURAL JOIN - master_topicCounts as main ''' - self.wordstables = " " - self.wordswhere = " TRUE " - - else: - """ - Have _no_ words table if no words searched for or grouped by; - instead just use nwords. This - means that we can use the same basic functions both to build the - counts for word searches and - for metadata searches, which is valuable because there is a - metadata-only search built in to every single ratio - query. (To get the denominator values). - - Call this OLAP, if you like. - """ - self.main = " " - self.operation = ','.join(self.catoperations) - """ - This, above is super important: the operation used is relative to the counttype, and changes to use 'catoperation' instead of 'bookoperation' - That's the place that the denominator queries avoid having to do a table scan on full bookcounts that would take hours, and instead takes - milliseconds. - """ - self.wordstables = " " - self.wordswhere = " TRUE " - # Just a dummy thing to make the SQL writing easier. Shouldn't take any time. Will usually be extended with actual conditions. 
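The table choice that `build_wordstables` just made (unigram searches hit the per-book unigram counts, two-word phrases hit the bigram table, anything longer is rejected, and a pure metadata query needs no words table at all) reduces to a small dispatch; a hedged sketch using the MySQL table names from the hunk above:

```
def choose_words_table(max_word_length, needs_topics=False):
    """Dispatch on phrase length, echoing the branches of build_wordstables."""
    if max_word_length > 2:
        raise ValueError("Phrase is longer than what Bookworm supports")
    if max_word_length == 2:
        return "master_bigrams"
    if max_word_length == 1:
        return "master_topicWords" if needs_topics else "master_bookcounts"
    if needs_topics:
        return "master_topicCounts"
    return None   # metadata-only query: just sum the precomputed nwords column

assert choose_words_table(2) == "master_bigrams"
assert choose_words_table(1) == "master_bookcounts"
assert choose_words_table(0) is None
```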
- - def set_operations(self): - """ - This is the code that allows multiple values to be selected. - - All can be removed when we kill back compatibility ! It's all handled now by the general_API, not the SQL_API. - """ - - backCompatability = {"Occurrences_per_Million_Words":"WordsPerMillion", "Raw_Counts":"WordCount", "Percentage_of_Books":"TextPercent", "Number_of_Books":"TextCount"} - - for oldKey in list(backCompatability.keys()): - self.counttype = [re.sub(oldKey, backCompatability[oldKey], entry) for entry in self.counttype] - - self.bookoperation = {} - self.catoperation = {} - self.finaloperation = {} - - # Text statistics - self.bookoperation['TextPercent'] = "count(DISTINCT " + self.prefs['fastcat'] + ".bookid) as TextCount" - self.bookoperation['TextRatio'] = "count(DISTINCT " + self.prefs['fastcat'] + ".bookid) as TextCount" - self.bookoperation['TextCount'] = "count(DISTINCT " + self.prefs['fastcat'] + ".bookid) as TextCount" - - # Word Statistics - self.bookoperation['WordCount'] = "sum(main.count) as WordCount" - self.bookoperation['WordsPerMillion'] = "sum(main.count) as WordCount" - self.bookoperation['WordsRatio'] = "sum(main.count) as WordCount" - - """ - +Total Numbers for comparisons/significance assessments - This is a little tricky. The total words is EITHER the denominator (as in a query against words per Million) or the numerator+denominator (if you're comparing - Pittsburg and Pittsburgh, say, and want to know the total number of uses of the lemma. For now, "TotalWords" means the former and "SumWords" the latter, - On the theory that 'TotalWords' is more intuitive and only I (Ben) will be using SumWords all that much. - """ - self.bookoperation['TotalWords'] = self.bookoperation['WordsPerMillion'] - self.bookoperation['SumWords'] = self.bookoperation['WordsPerMillion'] - self.bookoperation['TotalTexts'] = self.bookoperation['TextCount'] - self.bookoperation['SumTexts'] = self.bookoperation['TextCount'] - - for stattype in list(self.bookoperation.keys()): - if re.search("Word", stattype): - self.catoperation[stattype] = "sum(nwords) as WordCount" - if re.search("Text", stattype): - self.catoperation[stattype] = "count(nwords) as TextCount" - - self.finaloperation['TextPercent'] = "IFNULL(numerator.TextCount,0)/IFNULL(denominator.TextCount,0)*100 as TextPercent" - self.finaloperation['TextRatio'] = "IFNULL(numerator.TextCount,0)/IFNULL(denominator.TextCount,0) as TextRatio" - self.finaloperation['TextCount'] = "IFNULL(numerator.TextCount,0) as TextCount" - - self.finaloperation['WordsPerMillion'] = "IFNULL(numerator.WordCount,0)*100000000/IFNULL(denominator.WordCount,0)/100 as WordsPerMillion" - self.finaloperation['WordsRatio'] = "IFNULL(numerator.WordCount,0)/IFNULL(denominator.WordCount,0) as WordsRatio" - self.finaloperation['WordCount'] = "IFNULL(numerator.WordCount,0) as WordCount" - - self.finaloperation['TotalWords'] = "IFNULL(denominator.WordCount,0) as TotalWords" - self.finaloperation['SumWords'] = "IFNULL(denominator.WordCount,0) + IFNULL(numerator.WordCount,0) as SumWords" - self.finaloperation['TotalTexts'] = "IFNULL(denominator.TextCount,0) as TotalTexts" - self.finaloperation['SumTexts'] = "IFNULL(denominator.TextCount,0) + IFNULL(numerator.TextCount,0) as SumTexts" - - """ - The values here will be chosen in build_wordstables; that's what decides if it uses the 'bookoperation' or 'catoperation' dictionary to build out. 
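The `WordsPerMillion` expression defined above (`numerator*100000000/denominator/100`) works out to an ordinary per-million scaling; a short worked check with made-up counts:

```
# Hypothetical counts: 1,234 hits against a 5,000,000-word denominator.
numerator_wordcount = 1234
denominator_wordcount = 5_000_000

# The SQL-style expression numerator*100000000/denominator/100 ...
sql_style = numerator_wordcount * 100000000 / denominator_wordcount / 100
# ... is algebraically the same as scaling to words per million:
plain = numerator_wordcount * 1_000_000 / denominator_wordcount

assert abs(sql_style - plain) < 1e-9
print(sql_style)   # 246.8 words per million
```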
- """ - - self.finaloperations = list() - self.bookoperations = set() - self.catoperations = set() - - for summaryStat in self.counttype: - self.catoperations.add(self.catoperation[summaryStat]) - self.bookoperations.add(self.bookoperation[summaryStat]) - self.finaloperations.append(self.finaloperation[summaryStat]) - - def counts_query(self): - - self.operation = ','.join(self.bookoperations) - self.build_wordstables() - - countsQuery = """ - SELECT - %(selections)s, - %(operation)s - FROM - %(catalog)s - %(main)s - %(wordstables)s - WHERE - %(catwhere)s AND %(wordswhere)s - GROUP BY - %(groupings)s - """ % self.__dict__ - return countsQuery - - def bookid_query(self): - # A temporary method to setup the hasword query. - self.operation = ','.join(self.bookoperations) - self.build_wordstables() - - countsQuery = """ - SELECT - main.bookid as bookid - FROM - %(catalog)s - %(main)s - %(wordstables)s - WHERE - %(catwhere)s AND %(wordswhere)s - """ % self.__dict__ - return countsQuery - - def debug_query(self): - query = self.ratio_query(materialize = False) - return json.dumps(self.denominator.groupings.split(",")) + query - - def query(self, materialize=False): - """ - We launch a whole new userquery instance here to build the denominator, based on the 'compare_dictionary' option (which in most - cases is the search_limits without the keys, see above; it can also be specially defined using asterisks as a shorthand to identify other fields to drop. - We then get the counts_query results out of that result. - """ - - """ - self.denominator = userquery(outside_dictionary = self.compare_dictionary,db=self.db,databaseScheme=self.databaseScheme) - self.supersetquery = self.denominator.counts_query() - supersetIndices = self.denominator.groupings.split(",") - if materialize: - self.supersetquery = derived_table(self.supersetquery,self.db,indices=supersetIndices).materialize() - """ - self.mainquery = self.counts_query() - self.countcommand = ','.join(self.finaloperations) - self.totalselections = ",".join([group for group in self.outerGroups if group!="1 as In_Library" and group != ""]) - if self.totalselections != "": - self.totalselections += ", " - - query = """ - SELECT - %(totalselections)s - %(countcommand)s - FROM - (%(mainquery)s) as numerator - %(joinSuffix)s - GROUP BY %(groupings)s;""" % self.__dict__ - - logging.debug("Query: %s" % query) - return query - - def returnPossibleFields(self): - try: - self.cursor.execute("SELECT name,type,description,tablename,dbname,anchor FROM masterVariableTable WHERE status='public'") - colnames = [line[0] for line in self.cursor.description] - returnset = [] - for line in self.cursor.fetchall(): - thisEntry = {} - for i in range(len(line)): - thisEntry[colnames[i]] = line[i] - returnset.append(thisEntry) - except: - returnset=[] - return returnset - - def bibliography_query(self, limit = "100"): - # I'd like to redo this at some point so it could work as an API call more naturally. - self.limit = limit - self.ordertype = "sum(main.count*10000/nwords)" - try: - if self.outside_dictionary['ordertype'] == "random": - if self.counttype == ["Raw_Counts"] or self.counttype == ["Number_of_Books"] or self.counttype == ['WordCount'] or self.counttype == ['BookCount'] or self.counttype == ['TextCount']: - self.ordertype = "RAND()" - else: - # This is a based on an attempt to match various different distributions I found on the web somewhere to give - # weighted results based on the counts. It's not perfect, but might be good enough. 
Actually doing a weighted random search is not easy without - # massive memory usage inside sql. - self.ordertype = "LOG(1-RAND())/sum(main.count)" - except KeyError: - pass - - # If IDF searching is enabled, we could add a term like '*IDF' here to overweight better selecting words - # in the event of a multiple search. - self.idfterm = "" - prep = self.counts_query() - - if self.main == " ": - self.ordertype="RAND()" - - bibQuery = """ - SELECT searchstring - FROM """ % self.__dict__ + self.prefs['fullcat'] + """ RIGHT JOIN ( - SELECT - """+ self.prefs['fastcat'] + """.bookid, %(ordertype)s as ordering - FROM - %(catalog)s - %(main)s - %(wordstables)s - WHERE - %(catwhere)s AND %(wordswhere)s - GROUP BY bookid ORDER BY %(ordertype)s DESC LIMIT %(limit)s - ) as tmp USING(bookid) ORDER BY ordering DESC; - """ % self.__dict__ - return bibQuery - - def disk_query(self, limit="100"): - pass - - def return_books(self): - # This preps up the display elements for a search: it returns an array with a single string for each book, sorted in the best possible way - silent = self.cursor.execute(self.bibliography_query()) - returnarray = [] - for line in self.cursor.fetchall(): - returnarray.append(line[0]) - if not returnarray: - # why would someone request a search with no locations? - # Turns out (usually) because the smoothing tricked them. - returnarray.append("") - newerarray = self.custom_SearchString_additions(returnarray) - return json.dumps(newerarray) - - def search_results(self): - # This is an alias that is handled slightly differently in - # APIimplementation (no "RESULTS" bit in front). Once - # that legacy code is cleared out, they can be one and the same. - - return json.loads(self.return_books()) - - def getActualSearchedWords(self): - if len(self.wordswhere) > 7: - words = self.outside_dictionary['search_limits']['word'] - # Break bigrams into single words. - words = ' '.join(words).split(' ') - self.cursor.execute("SELECT word FROM {} WHERE {}".format(self.wordsheap, where_from_hash({self.word_field:words}))) - self.actualWords = [item[0] for item in self.cursor.fetchall()] - else: - raise TypeError("Suspiciously low word count") - self.actualWords = ["tasty", "mistake", "happened", "here"] - - def custom_SearchString_additions(self, returnarray): - """ - It's nice to highlight the words searched for. 
This will be on partner web sites, so requires custom code for different databases - """ - db = self.outside_dictionary['database'] - if db in ('jstor', 'presidio', 'ChronAm', 'LOC', 'OL'): - self.getActualSearchedWords() - if db == 'jstor': - joiner = "&searchText=" - preface = "?Search=yes&searchText=" - urlRegEx = "http://www.jstor.org/stable/\d+" - if db == 'presidio' or db == 'OL': - joiner = "+" - preface = "# page/1/mode/2up/search/" - urlRegEx = 'http://archive.org/stream/[^"# ><]*' - if db in ('ChronAm', 'LOC'): - preface = "/;words=" - joiner = "+" - urlRegEx = 'http://chroniclingamerica.loc.gov[^\"><]*/seq-\d+' - newarray = [] - for string in returnarray: - try: - base = re.findall(urlRegEx, string)[0] - newcore = ' search inside ' - string = re.sub("^", "", string) - string = re.sub("$", "", string) - string = string+newcore - except IndexError: - pass - newarray.append(string) - # Arxiv is messier, requiring a whole different URL interface: http://search.arxiv.org:8081/paper.jsp?r=1204.3352&qs=netwokr - else: - newarray = returnarray - return newarray - - def return_tsv(self, query = "ratio_query"): - if self.outside_dictionary['counttype'] == "Raw_Counts" or self.outside_dictionary['counttype'] == ["Raw_Counts"]: - query="counts_query" - # This allows much speedier access to counts data if you're - # willing not to know about all the zeroes. - # Will not work as well once the id_fields are in use. - querytext = getattr(self, query)() - silent = self.cursor.execute(querytext) - results = ["\t".join([to_unicode(item[0]) for item in self.cursor.description])] - lines = self.cursor.fetchall() - for line in lines: - items = [] - for item in line: - item = to_unicode(item) - item = re.sub("\t", "", item) - items.append(item) - results.append("\t".join(items)) - return "\n".join(results) - - def execute(self): - # This performs the query using the method specified in the passed parameters. - if self.method == "Nothing": - pass - else: - value = getattr(self, self.method)() - return value - -class derived_table(object): - """ - MySQL/MariaDB doesn't have good subquery materialization, - so I'm implementing it by hand. - """ - def __init__(self, SQLstring, db, indices = [], dbToPutIn = "bookworm_scratch"): - """ - initialize with the code to create the table; the database it will be in - (to prevent conflicts with other identical queries in other dbs); - and the list of all tables to be indexed - (optional, but which can really speed up joins) - """ - self.query = SQLstring - self.db = db - # Each query is identified by a unique key hashed - # from the query and the dbname. - self.queryID = dbToPutIn + "." + "derived" + hashlib.sha1(self.query + db.dbname).hexdigest() - self.indices = "(" + ",".join(["INDEX(%s)" % index for index in indices]) + ")" if indices != [] else "" - - def setStorageEngines(self, temp): - """ - Chooses where and how to store tables. - """ - self.tempString = "TEMPORARY" if temp else "" - self.engine = "MEMORY" if temp else "MYISAM" - - def checkCache(self): - """ - Checks what's already been calculated. 
- """ - try: - (self.count, self.created, self.modified, self.createCode, self.data) = self.db.cursor.execute("SELECT count,created,modified,createCode,data FROM bookworm_scratch.cache WHERE fieldname='%s'" %self.queryID)[0] - return True - except: - (self.count, self.created, self.modified, self.createCode, self.data) = [None]*5 - return False - - def fillTableWithData(self, data): - dataCode = "INSERT INTO %s values ("%self.queryID + ", ".join(["%s"]*len(data[0])) + ")" - self.db.cursor.executemany(dataCode, data) - self.db.db.commit() - - -class databaseSchema(object): - """ - This class stores information about the database setup that is used to optimize query creation query - and so that queries know what tables to include. - It's broken off like this because it might be usefully wrapped around some of the backend features, - because it shouldn't be run multiple times in a single query (that spawns two instances of itself), as was happening before. - - It's closely related to some of the classes around variables and variableSets in the Bookworm Creation scripts, - but is kept separate for now: that allows a bit more flexibility, but is probaby a Bad Thing in the long run. - """ - - def __init__(self, db): - self.db = db - self.cursor=db.cursor - # has of what table each variable is in - self.tableToLookIn = {} - # hash of what the root variable for each search term is (eg, 'author_birth' might be crosswalked to 'authorid' in the main catalog.) - self.anchorFields = {} - # aliases: a hash showing internal identifications codes that dramatically speed up query time, but which shouldn't be exposed. - # So you can run a search for "state," say, and the database will group on a 50-element integer code instead of a VARCHAR that - # has to be long enough to support "Massachusetts" and "North Carolina." - # A couple are hard-coded in, but most are derived by looking for fields that end in the suffix "__id" later. - - if self.db.dbname == "presidio": - self.aliases = {"classification":"lc1", "lat":"pointid", "lng":"pointid"} - else: - self.aliases = dict() - - try: - # First build using the new streamlined tables; if that fails, - # build using the old version that hits the INFORMATION_SCHEMA, - # which is bad practice. - self.newStyle(db) - except: - # The new style will fail on old bookworms: a failure is an easy way to test - # for oldness, though of course something else might be causing the failure. - self.oldStyle(db) - - def newStyle(self, db): - self.tableToLookIn['bookid'] = self.fallback_table('fastcat') - self.anchorFields['bookid'] = self.fallback_table('fastcat') - self.anchorFields['wordid'] = 'wordid' - self.tableToLookIn['wordid'] = self.wordsheap - - tablenames = dict() - tableDepends = dict() - db.cursor.execute("SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);") - for row in db.cursor.fetchall(): - (dbname, alias, tablename, dependsOn) = row - self.tableToLookIn[dbname] = tablename - self.anchorFields[tablename] = dependsOn - self.aliases[dbname] = alias - - def oldStyle(self, db): - - # This is sorted by engine DESC so that memory table locations will overwrite disk table in the hash. 
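As a concrete (and entirely hypothetical) picture of the three hashes that `databaseSchema` builds, consider a bookworm with an authors side-table and a `state` field backed by an integer code; the exact contents depend on the bookworm being queried:

```
# Hypothetical contents of the three lookup hashes described above.
tableToLookIn = {
    "bookid": "fastcat",
    "wordid": "wordsheap",
    "state": "stateLookup",        # the variable lives in its own table ...
    "author_birth": "authors",
}
anchorFields = {
    "state": "state__id",          # ... and joins back on this key
    "author_birth": "authorid",
}
aliases = {
    "state": "state__id",          # group on the small integer code,
}                                  # not the VARCHAR itself

def grouping_column(field):
    """Which column should the compiled SQL actually GROUP BY?"""
    return aliases.get(field, field)

print(grouping_column("state"))         # state__id
print(grouping_column("publisher"))     # publisher (no alias, use as-is)
```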
- - self.cursor.execute("SELECT ENGINE,TABLE_NAME,COLUMN_NAME,COLUMN_KEY,TABLE_NAME='fastcat' OR TABLE_NAME='wordsheap' AS privileged FROM information_schema.COLUMNS JOIN INFORMATION_SCHEMA.TABLES USING (TABLE_NAME,TABLE_SCHEMA) WHERE TABLE_SCHEMA='%(dbname)s' ORDER BY privileged,ENGINE DESC,TABLE_NAME,COLUMN_KEY DESC;" % self.db.__dict__) - columnNames = self.cursor.fetchall() - - parent = 'bookid' - previous = None - for databaseColumn in columnNames: - if previous != databaseColumn[1]: - if databaseColumn[3] == 'PRI' or databaseColumn[3] == 'MUL': - parent = databaseColumn[2] - previous = databaseColumn[1] - else: - parent = 'bookid' - else: - self.anchorFields[databaseColumn[2]] = parent - if databaseColumn[3]!='PRI' and databaseColumn[3]!="MUL": # if it's a primary key, this isn't the right place to find it. - self.tableToLookIn[databaseColumn[2]] = databaseColumn[1] - if re.search('__id\*?$', databaseColumn[2]): - self.aliases[re.sub('__id', '', databaseColumn[2])]=databaseColumn[2] - - try: - cursor = self.cursor.execute("SELECT dbname,tablename,anchor,alias FROM masterVariableTables") - for row in cursor.fetchall(): - if row[0] != row[3]: - self.aliases[row[0]] = row[3] - if row[0] != row[2]: - self.anchorFields[row[0]] = row[2] - # Should be uncommented, but some temporary issues with the building script - # self.tableToLookIn[row[0]] = row[1] - except: - pass - self.tableToLookIn['bookid'] = 'fastcat' - self.anchorFields['bookid'] = 'fastcat' - self.anchorFields['wordid'] = 'wordid' - self.tableToLookIn['wordid'] = 'wordsheap' - -def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_joiner = " OR "): - whereterm = [] - # The general idea here is that we try to break everything in search_limits down to a list, and then create a whereterm on that joined by whatever the 'joiner' is ("AND" or "OR"), with the comparison as whatever comp is ("=",">=",etc.). - # For more complicated bits, it gets all recursive until the bits are all in terms of list. - if joiner is None: - joiner = " AND " - for key in list(myhash.keys()): - values = myhash[key] - if isinstance(values, (str, bytes)) or isinstance(values, int) or isinstance(values, float): - # This is just human-being handling. You can pass a single value instead of a list if you like, and it will just convert it - # to a list for you. - values = [values] - # Or queries are special, since the default is "AND". This toggles that around for a subportion. - - if key == "$or" or key == "$OR": - local_set = [] - for comparison in values: - local_set.append(where_from_hash(comparison, comp=comp)) - whereterm.append(" ( " + " OR ".join(local_set) + " )") - elif key == '$and' or key == "$AND": - for comparison in values: - whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp)) - elif isinstance(values, dict): - if joiner is None: - joiner = " AND " - # Certain function operators can use MySQL terms. - # These are the only cases that a dict can be passed as a limitations - operations = {"$gt":">", "$ne":"!=", "$lt":"<", - "$grep":" REGEXP ", "$gte":">=", - "$lte":"<=", "$eq":"="} - - for operation in list(values.keys()): - if operation == "$ne": - # If you pass a lot of ne values, they must *all* be false. 
- subjoiner = " AND " - else: - subjoiner = " OR " - whereterm.append(where_from_hash({key:values[operation]}, comp=operations[operation], list_joiner=subjoiner)) - elif isinstance(values, list): - # and this is where the magic actually happens: - # the cases where the key is a string, and the target is a list. - if isinstance(values[0], dict): - # If it's a list of dicts, then there's one thing that happens. - # Currently all types are assumed to be the same: - # you couldn't pass in, say {"year":[{"$gte":1900}, 1898]} to - # catch post-1898 years except for 1899. Not that you - # should need to. - for entry in values: - whereterm.append(where_from_hash(entry)) - else: - # Note that about a third of the code is spent on escaping strings. - if escapeStrings: - if isinstance(values[0], (str, bytes)): - quotesep = "'" - else: - quotesep = "" - - def escape(value): - # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. - return str(MySQLdb.escape_string(to_unicode(value)), 'utf-8') - else: - def escape(value): - return to_unicode(value) - quotesep = "" - - joined = list_joiner.join([" ({}{}{}{}{}) ".format(key, comp, quotesep, escape(value), quotesep) for value in values]) - whereterm.append(" ( {} ) ".format(joined)) - - if len(whereterm) > 1: - return "(" + joiner.join(whereterm) + ")" - else: - return whereterm[0] - # This works pretty well, except that it requires very specific sorts of terms going in, I think.
diff --git a/bookwormDB/__init__.py b/bookwormDB/__init__.py index e69de29..67ca668 100644 --- a/bookwormDB/__init__.py +++ b/bookwormDB/__init__.py @@ -0,0 +1,4 @@ +from .builder import BookwormCorpus +from .general_API import DuckDBCall + +n = 1 \ No newline at end of file
diff --git a/bookwormDB/bin/dbbindings-flask.py b/bookwormDB/bin/dbbindings-flask.py index 281bb47..a2ce05d 100755 --- a/bookwormDB/bin/dbbindings-flask.py +++ b/bookwormDB/bin/dbbindings-flask.py @@ -5,6 +5,8 @@ from flask import Flask, request, Response, jsonify import json import os +import logging +logger = logging.getLogger("bookworm") app = Flask(__name__) @@ -18,7 +20,6 @@ def index(): @app.route('/debug') def debug_api(): - import logging logging.basicConfig(level=logging.INFO) JSONinput = request.args.get('queryTerms') or request.args.get('query') if not JSONinput:
diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py new file mode 100644 index 0000000..b2fc05e --- /dev/null +++ b/bookwormDB/builder.py @@ -0,0 +1,256 @@ +from ducksauce import from_files +import duckdb +import numpy as np +from base64 import b64encode, b64decode +import pyarrow as pa +from nonconsumptive import Corpus +from nonconsumptive.metadata import Catalog +from pathlib import Path +import logging +from pyarrow import feather, parquet, dataset +from typing import List +logger = logging.getLogger("bookworm") + +class BookwormCorpus(Corpus): + """ + Create a Bookworm corpus. Uses write db locations, so should + not be used to manage existing ones or in a multi-threaded context.
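Stepping back to `where_from_hash`, which closed just before the `builder.py` hunk: a deliberately simplified sketch (no escaping, lists and comparison operators only, with hypothetical field names) of the dictionary-to-WHERE translation it performs:

```
OPS = {"$gt": ">", "$gte": ">=", "$lt": "<", "$lte": "<=", "$ne": "!=", "$eq": "="}

def simple_where(limits):
    """Toy version of where_from_hash: lists OR together, keys AND together."""
    clauses = []
    for key, value in limits.items():
        if key in ("$or", "$OR"):
            clauses.append("(" + " OR ".join(simple_where(v) for v in value) + ")")
        elif isinstance(value, dict):                     # comparison operators
            for op, v in value.items():
                clauses.append("({} {} {})".format(key, OPS[op], v))
        else:                                             # plain list of values
            if not isinstance(value, list):
                value = [value]
            clauses.append("(" + " OR ".join(
                "{} = '{}'".format(key, v) for v in value) + ")")
    return " AND ".join(clauses)

print(simple_where({"year": {"$gte": 1900, "$lt": 1950},
                    "author": ["Melville", "Hawthorne"]}))
# (year >= 1900) AND (year < 1950) AND (author = 'Melville' OR author = 'Hawthorne')
```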
+ """ + + def __init__(self, db_location, ngrams, *args, **kwargs): + self.db_location = Path(db_location) + self._connection = None + self.ngrams = ngrams + super().__init__(*args, **kwargs) + + def encoded_batches(self): + for batch in self.encoded_wordcounts(): + yield batch + + def bookworm_name(self): + return self.db_location.with_suffix("").name + + """ + def ingest_ngrams_ncid(self, levels = ['bigram'], block_size = 2_500_000_000): + con = self.con + for i, f in enumerate(levels): + con.execute(f"DROP TABLE IF EXISTS {f}__ncid") + ngrams = i + 1 + logging.info(f"Creating {f} table.") + sort_order = [f"word{i + 1}" for i in range(ngrams)] + ["_ncid"] + ingest_file = self.root / f"{f}__ncid.parquet" + inputs = [*(self.root / f"encoded_{f}s").glob("*")] + print(inputs) + from_files(inputs, sort_order, ingest_file, block_size = block_size) + con.execute(f"CREATE TABLE {f}__ncid AS SELECT {} FROM parquet_scan('{ingest_file}')") + """ + def prepare_metadata(self): + self.metadata.to_flat_catalog() + + def flat_tabs(self): + """ + Level-3 normalized database tables with integer keys for faster grouping and selection. + """ + return (self.root / "metadata" / "flat_catalog").glob("*.parquet") + + @property + def con(self): + if self._connection is not None: + return self._connection + self._connection = duckdb.connect(str(self.db_location)) + return self._connection + + + def ingest_wordids(self): + con = self.con + fin = self.root / 'wordids.feather' + word_table = pa.feather.read_table(fin) + pa.parquet.write_table(word_table, fin.with_suffix(".parquet")) + logger.debug("INGESTING INTO words") + con.execute(f"CREATE TABLE words AS SELECT * FROM parquet_scan('{self.root / 'wordids.parquet'}')") + logger.debug("INGESTING INTO wordsheap") + con.execute(f"CREATE TABLE wordsheap AS SELECT wordid, token as word, lower(token) as lowercase FROM words") + + def create_sorted_ngram_ncid(self, ngrams : int, force : bool = False): + lookup = [ + None, + "unigram", + "bigram", + "trigram", + "quadgram", + "quintgram" + ] + name = lookup[ngrams] + + path = self.root / f'{name}__ncid.parquet' + inputs : List[Path]= [*(self.root / f'encoded_{name}s').glob("*.feather")] + columns : List[str] = [] # Defined below. 
+ if path.exists(): + last_time = max([p.stat().st_mtime for p in inputs]) + if path.stat().st_mtime < last_time or force: + path.unlink() + else: + return + if ngrams > 1: + columns = [f'word{i + 1}' for i in range(ngrams)] + else: + columns = ['wordid'] + logging.getLogger("ducksauce").setLevel(logging.getLogger("bookworm").level) + from_files(inputs, [*columns, '_ncid'], path, block_size = 1_000_000_000) + + def ingest_unigram__ncid(self): + con = self.con + self.create_sorted_ngram_ncid(ngrams = 1) + wordids = self.root / 'unigram__ncid.parquet' + con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM parquet_scan('{wordids}')") + schema = parquet.ParquetFile(wordids).schema_arrow + self.insert_table_schema("unigram__ncid", schema) + +# wordids.unlink() + """ + encoded = dataset.dataset(self.root / 'encoded_unigrams', format = "feather") + con.register_arrow("unigrams_dataset", encoded) + con.execute("CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT wordid, _ncid, count FROM unigrams_dataset ORDER BY wordid, _ncid") + """ + + def ingest_ngram__ncid(self, ngrams = 2): + con = self.con + lookup = [ + None, + None, + "bigram", + "trigram", + "quadgram", + "quintgram" + ] + name = lookup[ngrams] + + wordids = self.root / f'{name}__ncid.parquet' + self.create_sorted_ngram_ncid(ngrams) + con.execute(f"CREATE TABLE IF NOT EXISTS {name}__ncid AS SELECT * FROM parquet_scan('{wordids}')") + schema = parquet.ParquetFile(wordids).schema_arrow + self.insert_table_schema(f"{name}__ncid", schema) + + """ + encoded = dataset.dataset(self.root / f'encoded_{name}s', format = "feather") + con.register_arrow("dataset", encoded) + word_cols = ", ".join([f"word{i + 1}" for i in range(ngrams)]) + + con.execute(f"DROP TABLE IF EXISTS {name}__ncid") + con.execute(f"CREATE TABLE IF NOT EXISTS {name}__ncid AS SELECT {word_cols}, _ncid, count " + f"FROM dataset order by {word_cols}, _ncid") + self.insert_table_schema(name + "__ncid", encoded.schema) + """ + + def ingest_metadata(self) -> None: + for tabpath in self.flat_tabs(): + name = tabpath.with_suffix("").name + self.con.execute(f"CREATE TABLE {name} AS SELECT * FROM parquet_scan('{tabpath}')") + + def create_table_schemas(self): + con = self.con + insertion = 'INSERT INTO arrow_schemas VALUES (?, ?, ?)' + + rich = self.metadata.tb + con.execute(insertion, ("catalog_ld", b64encode(rich.schema.serialize().to_pybytes()), "resource")) + + ## Insert schemas into the database for later retrieval to understand the db structure + # Stash as base64 b/c + # DuckDB can't yet handle blob inserts from python. 
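The ingest pattern used throughout `builder.py` (parquet files pulled into DuckDB with `parquet_scan`, and each table's Arrow schema stashed as base64 text for later retrieval) can be exercised end to end in a few lines. A hedged, self-contained sketch with toy data rather than the builder's own fixtures:

```
import os, tempfile
from base64 import b64encode, b64decode
import duckdb
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import ipc

# A stand-in for wordids.parquet (hypothetical toy data).
tmp = os.path.join(tempfile.mkdtemp(), "wordids.parquet")
words = pa.table({"wordid": [1, 2, 3], "token": ["the", "of", "whale"]})
pq.write_table(words, tmp)

con = duckdb.connect(":memory:")

# Ingest pattern: CREATE TABLE ... AS SELECT * FROM parquet_scan(...).
con.execute(f"CREATE TABLE words AS SELECT * FROM parquet_scan('{tmp}')")

# Stash the Arrow schema as base64 text, as create_table_schemas does,
# then read it back to recover column names and types.
con.execute("CREATE TABLE arrow_schemas (name VARCHAR, schema VARCHAR, type VARCHAR)")
encoded = b64encode(words.schema.serialize().to_pybytes()).decode("ascii")
con.execute("INSERT INTO arrow_schemas VALUES (?, ?, ?)", ("words", encoded, "table"))

stored = con.execute("SELECT schema FROM arrow_schemas WHERE name = 'words'").fetchone()[0]
print(ipc.read_schema(pa.py_buffer(b64decode(stored))))
# wordid: int64
# token: string
```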
+ # https://github.com/duckdb/duckdb/issues/1703 + + for tab in [*self.flat_tabs()] + [self.root / 'wordids.parquet']: + tabname = tab.with_suffix("").name + if tabname in ["sorted", "wordids"]: + continue + con.execute(insertion, + (tabname, b64encode(pa.parquet.ParquetFile(tab).schema_arrow.serialize().to_pybytes()), + "table")) + + def insert_table_schema(self, tabname, schema): + con = self.con + insertion = 'INSERT INTO arrow_schemas VALUES (?, ?, ?)' + con.execute(insertion, (tabname, b64encode(schema.serialize().to_pybytes()), "table")) + + def create_slow_catalog(self): + con = self.con + catcols = set(con.execute("DESCRIBE TABLE catalog").df()['Field']) + fastcols = set(con.execute("DESCRIBE TABLE fastcat").df()['Field']) + unique = ["_ncid"] + for col in catcols: + if col in fastcols or f"{col}__id" in fastcols: + continue + unique.append(f'"{col}"') + con.execute(f"CREATE VIEW slowcat AS SELECT {','.join(unique)} FROM catalog") + + def ingest_wordcounts(self): + """ + The total wordcounts for each document. + """ + self.con.execute('DROP TABLE IF EXISTS nwords') + self.con.execute('CREATE TABLE nwords ("@id" STRING, "nwords" INTEGER)') + logger.info("Creating nwords") + seen_a_word = False + for batch in self.iter_over('document_lengths', ids = "@id"): + seen_a_word = True + tb = pa.Table.from_batches([batch]) + self.con.register("t", tb.to_pandas()) + self.con.execute('INSERT INTO nwords ("@id", nwords) SELECT * FROM t') + self.con.unregister("t") + if not seen_a_word: + raise FileNotFoundError("No document lengths for corpus.") + logger.info("Creating nwords on `catalog`") + self.con.execute("ALTER TABLE catalog ADD nwords INTEGER") + logger.info("Updating nwords on `catalog` from nwords table.") + self.con.execute('UPDATE catalog SET nwords = nwords.nwords FROM nwords WHERE "catalog"."@id" = "nwords"."@id"') + logger.info("Creating nwords on `fastcat`.") + self.con.execute("ALTER TABLE fastcat ADD nwords INTEGER") + logger.info("Updating nwords on `fastcat` from catalog table.") + self.con.execute('UPDATE fastcat SET nwords = catalog.nwords FROM catalog WHERE fastcat._ncid = catalog._ncid') + + def build(self): + logger.info("Preparing metadata") + self.prepare_metadata() + + logger.info("Creating unigrams for duck ingest") + for k in ["token_counts", "tokenization"]: + if k in self.cache_set: + self.multiprocess(k) + self.total_wordcounts # To cache it + self.multiprocess("encoded_unigrams") + ngrams = self.ngrams + if ngrams > 1: + self.multiprocess("encoded_bigrams") + if ngrams > 2: + self.multiprocess("encoded_trigrams") + if ngrams > 3: + self.multiprocess("encoded_quadgrams") + if ngrams > 4: + self.multiprocess("encoded_quintgrams") + + con = self.con + con.execute('CREATE TABLE IF NOT EXISTS arrow_schemas (name VARCHAR, schema VARCHAR, type VARCHAR)') + + self.ingest_wordids() + logger.info("Sorting and ingesting unigrams") + self.ingest_unigram__ncid() + logger.info("Ingesting metadata") + self.ingest_metadata() + logger.info("Creating schemas for load") + + self.ingest_wordcounts() + for i in range(ngrams): + grams = i + 1 + if i == 0: + continue + logger.info(f"Ingesting {i}grams") + self.ingest_ngram__ncid(grams) + + self.create_table_schemas() + + logger.info("Building slow catalog view") + self.create_slow_catalog() + self.con.close() + self._connection = duckdb.connect(str(self.db_location), read_only = True) + +RESERVED_NAMES = ["slowcat", "fastcat", "catalog", "my_nwords", "unigram__ncid"] \ No newline at end of file diff --git a/bookwormDB/configuration.py 
b/bookwormDB/configuration.py index 5e8a77e..146400c 100644 --- a/bookwormDB/configuration.py +++ b/bookwormDB/configuration.py @@ -1,213 +1,4 @@ -#!/usr/bin/python -from __future__ import print_function -import configparser -import os -import sys -import re -import MySQLdb -import argparse -import getpass -import subprocess -import logging -import uuid - -def update(): - ## Assemble list of all bookworms on the system. - - bookworms = [] ### ... - - ## Create on-disk versions of memory tables if 'fastcat_' does not exists. - - pass - - ## Allow "'bookworm'@'localhost' IDENTIFIED BY ''" to have select access on each bookworm. - - pass - - ## Print a message about enabling access. - - pass - - -def create(ask_about_defaults=True, database=None): - """ - Through interactive prompts at the command line, builds up a file at - bookworm.cnf that can be used to set preferences for the installation. - """ - - if ask_about_defaults: - print(""" - Welcome to Bookworm. - ~~~~~~~~~~~~~~~~~~~~ - First off, let's build a configuration file. This will live - at bookworm.cnf in the current directory: if you mistype anything, - or want to change settings, edit it directly in that location. - - For each of the following entries, type the value you want, or hit - enter to accept the default: - - """) - else: - logging.info("Auto-generating config file.") - - """ - First, we go to great efforts to find some sensible defaults - Usually the user can just hit enter. - """ - - systemConfigFile = configparser.SafeConfigParser(allow_no_value=True) - - defaults = dict() - # The default bookwormname is just the current location - - if database is None: - defaults['database'] = os.path.relpath(".", "..") - else: - defaults['database'] = database - - defaults["user"] = "bookworm" - defaults["password"] = "" - - config = configparser.ConfigParser() - - for section in ["client"]: - config.add_section(section) - - if ask_about_defaults: - database = input("What is the name of the bookworm [" + defaults['database'] + "]: ") - else: - database = defaults['database'] - - config.set("client", "database", re.sub(" ","_",database)) - config.write(open("bookworm.cnf", "w")) - -class Configfile(object): - def __init__(self, usertype, possible_locations=None, default=None, ask_about_defaults=True): - """ - Initialize with the type of the user. The last encountered file on - the list is the one that will be used. - If default is set, a file will be created at that location if none - of the files in possible_locations exist. - - If ask_about_defaults is false, it will do a force installation. 
- """ - - if not usertype in ['read_only', 'admin']: - raise NotImplementedError("Only read_only and admin supported") - - self.ask_about_defaults = ask_about_defaults - - logging.info("Creating configuration as " + usertype) - - self.usertype = usertype - - if possible_locations is None: - possible_locations = self.default_locations_from_type(usertype) - - self.location = None - - self.config = configparser.ConfigParser(allow_no_value=True) - - if usertype=="admin": - - self.ensure_section("client") - self.ensure_section("mysqld") - - self.config.set("client", "host", "localhost") - self.config.set("client", "user", "root") - self.config.set("client", "password", "") - - else: - self.ensure_section("client") - self.config.set("client", "host", "localhost") - self.config.set("client", "user", "bookworm") - self.config.set("client", "password", "") - - self.read_config_files(possible_locations) - - for string in possible_locations: - if os.path.exists(string): - self.location = string - - - def read_config_files(self, used_files): - - try: - self.config.read(used_files) - except configparser.MissingSectionHeaderError: - """ - Some files throw this error if you have an empty - my.cnf. This throws those out of the list, and tries again. - """ - for file in used_files: - try: - self.config.read(file) - except configparser.MissingSectionHeaderError: - used_files.remove(file) - successes = self.config.read(used_files) - - - - def default_locations_from_type(self,usertype): - """ - The default locations for each usertype. - Note that these are in ascending order of importance: - so the preferred location for admin and read_only configuration - is in /etc/bookworm/admin.cnf - and /etc/bookworm/client.cnf - """ - - if usertype=="admin": - return [os.path.abspath(os.path.expanduser("~/.my.cnf")), - os.path.abspath(os.path.expanduser("~/my.cnf")), - "/etc/bookworm/admin.cnf"] - if usertype == "read_only": - return ["~/.bookworm-sql.cnf", "/etc/bookworm/client.cnf"] - else: - return [] - - def ensure_section(self,section): - if not self.config.has_section(section): - self.config.add_section(section) - - def set_bookworm_options(self): - """ - A number of specific MySQL changes to ensure fast queries on Bookworm. - """ - self.ensure_section("mysqld") - - mysqldoptions = {"### = =": "THIS FILE SHOULD GENERALLY BE PLACED AT /etc/mysql/my.cnf = = = ###", "max_allowed_packet":"512M","sort_buffer_size":"8M","read_buffer_size":"8M","read_rnd_buffer_size":"8M","bulk_insert_buffer_size":"512M","myisam_sort_buffer_size":"5512M","myisam_max_sort_file_size":"5500G","key_buffer_size":"2500M","query_cache_size":"32M","tmp_table_size":"1024M","max_heap_table_size":"2048M","character_set_server":"utf8","query_cache_type":"1","query_cache_limit":"8M"} - - for option in list(mysqldoptions.keys()): - if not self.config.has_option("mysqld",option): - self.config.set("mysqld", option, mysqldoptions[option]) - else: - if mysqldoptions[option] != self.config.get("mysqld",option): - choice = input("Do you want to change the value for " + option + " from " + self.config.get("mysqld",option) + " to the bookworm-recommended " + mysqldoptions[option] + "? (y/N): ") - if choice=="y": - self.config.set("mysqld",option,mysqldoptions[option]) - - self.write_out() - - def write_out(self): - """ - Write out a new version of the configfile to stdout. 
- The user is responsible for putting this somewhere it will - affect the MySQL preferences - """ - self.config.write(sys.stdout) - -def recommend_my_cnf(known_loc = None): - if known_loc is None: - for loc in ["/usr/etc/my.cnf","/etc/mysql/my.cnf","/etc/my.cnf"]: - if os.path.exists(loc): - known_loc = loc - if known_loc is None: - raise FileNotFoundError("Could not find MySQL folder: pass one.") - cnf = Configfile(usertype = 'admin', possible_locations = [known_loc]) - cnf.set_bookworm_options() - cnf.write_out() diff --git a/bookwormDB/convertTSVtoJSONarray.py b/bookwormDB/convertTSVtoJSONarray.py deleted file mode 100644 index 3573f44..0000000 --- a/bookwormDB/convertTSVtoJSONarray.py +++ /dev/null @@ -1,28 +0,0 @@ -import json - -def convertToJSON(filename, location): - """ - given a filename of a tsv, converts that into an ndjson - file for Bookworm. - """ - input = open(filename) - output = open(location, "w") - headers = input.readline() - headers = headers.rstrip("\n") - headers = headers.rstrip("\r") - headers = headers.rstrip("\n") - headers = headers.rstrip("\r") - headers = headers.split("\t") - for line in input: - line = line.rstrip("\n") - line = line.rstrip("\r") - line = line.rstrip("\n") - line = line.rstrip("\r") - values = line.split("\t") - myobject = dict(list(zip(headers,values))) - output.write(json.dumps(myobject) + "\n") - output.close() - - - - diff --git a/bookwormDB/countManager.py b/bookwormDB/countManager.py index 0e87b4f..e69de29 100644 --- a/bookwormDB/countManager.py +++ b/bookwormDB/countManager.py @@ -1,170 +0,0 @@ -import sys -import os -import bounter -from collections import Counter -from .tokenizer import Tokenizer, tokenBatches, PreTokenized -from multiprocessing import Process, Queue, Pool -from .multiprocessingHelp import mp_stats, running_processes -import multiprocessing as mp -import psutil -import queue -import logging -import fileinput -import time -import csv - -cpus, memory = mp_stats() - - -# Allocate half of available memory for the bounter, in megabytes. -memory = int(memory/1024/1024/2) - -# Use another third of the memory for storing worker counts; divided -# by number of CPUS. -# Assume 200 bytes per entry in python dict. - -QUEUE_POST_THRESH = int(memory / 3 * 1024 * 1024 / 200 / cpus) -logging.debug("Ideal queue size is {}".format(QUEUE_POST_THRESH)) -QUEUE_POST_THRESH = max([100000, QUEUE_POST_THRESH]) - -logging.info("Filling dicts to size {}".format(QUEUE_POST_THRESH)) - -import random -import gzip - -def flush_counter(counter, qout): - for k in ['', '\x00']: - try: - del counter[k] - except KeyError: - continue - qout.put(counter) - -def counter(qout, i, fin, mode = "count"): - """ - # Counts words exactly in a separate process. - # It runs in place. - If mode is 'encode', this is called for a side-effect of writing - files to disk. - """ - - totals = 0 - errors = 0 - - if mode == "count": - counter = Counter() - encoder = tokenBatches(['words']) - - if mode == "encode": - encoder = tokenBatches(['unigrams', 'bigrams']) - - datatype = "raw" - - count_signals = [".unigrams", ".bigrams", ".trigrams", ".quadgrams"] - for signal in count_signals: - if signal in fin: - datatype = signal.strip(".") - if mode == "encode": - encoder = tokenBatches([datatype]) - - if (fin.endswith(".gz")): - fin = gzip.open(fin, 'rt') - else: - fin = open(fin) - - - for ii, row in enumerate(fin): - if ii % cpus != i: - # Don't do anything on most lines. 
- continue - totals += 1 - try: - (filename, text) = row.rstrip().split("\t",1) - except ValueError: - errors += 1 - continue - - if datatype == "raw": - tokenizer = Tokenizer(text) - else: - tokenizer = PreTokenized(text, encoder.levels[0]) - - # When encoding - if mode == "encode": - encoder.encodeRow(filename, tokenizer, write_completed = True) - continue - - # When building counts - counter.update(tokenizer.counts("words")) - - # When the counter is long, post it to the master and clear it. - if len(counter) > QUEUE_POST_THRESH: - flush_counter(counter=counter, qout = qout) - counter = Counter() - - # Cleanup. - if mode == "count": - logging.debug("Flushing leftover counts from thread {}".format(i)) - flush_counter(counter=counter, qout = qout) - if totals > 0 and errors/totals > 0.01: - logging.warning("Skipped {} rows without tabs".format(errors)) - if mode == "encode": - encoder.close() - -def create_counts(input): - qout = Queue(cpus * 2) - workers = [] - logging.info("Spawning {} count processes on {}".format(cpus, input)) - for i in range(cpus): - p = Process(target = counter, args = (qout, i, input, "count")) - p.start() - workers.append(p) - - wordcounter = bounter.bounter(memory) - - while True: - - try: - input_dict = qout.get_nowait() - logging.debug("inputting queue of length {} from worker".format(len(input_dict))) - wordcounter.update(input_dict) - - except queue.Empty: - if running_processes(workers): - time.sleep(1/100) - else: - break - except ValueError: - for k, v in input_dict.items(): - print("'{}'\t'{}'".format(k, v)) - wordcounter.update({k: v}) - raise - except TypeError: - for k, v in input_dict.items(): - print("'{}'\t'{}'".format(k, v)) - wordcounter.update({k: v}) - raise - - return wordcounter - -def create_wordlist(n, input, output): - - counter = create_counts(input) - counter = sorted(list(counter.iteritems()), key = lambda x: -1 * x[1]) - output = open(output, "w") - for i, (k, v) in enumerate(counter): - output.write("{}\t{}\t{}\n".format(i, k, v)) - if i >= n: - break - -def encode_words(wordlist, input = "input.txt"): - qout = Queue(cpus * 2) - workers = [] - - for i in range(cpus): - p = Process(target = counter, args = (qout, i, input, "encode")) - p.start() - workers.append(p) - - while running_processes(workers): - time.sleep(1/30) diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py new file mode 100644 index 0000000..7859539 --- /dev/null +++ b/bookwormDB/duckdb.py @@ -0,0 +1,642 @@ +#!/usr/local/bin/python + +from .search_limits import Search_limits, where_from_hash +from .bwExceptions import BookwormException +from .DuckSchema import DuckSchema +import json +import re +import copy +import hashlib +import logging +logger = logging.getLogger("bookworm") + +""" +APOLOGIA: + +This is the oldest part of the code base. There are a lot of places +where I didn't know how to do things yet, and probably more unused functions +than elsewhere. + +""" + +def fail_if_nonword_characters_in_columns(input): + keys = all_keys(input) + for key in keys: + if re.search(r"[^A-Za-z_$*0-9]", key): + logger.error("{} has nonword character".format(key)) + raise + +def all_keys(input): + """ + Recursive function. Get every keyname in every descendant of a dictionary. + Iterates down on list and dict structures to search for more dicts with + keys. 
+ """ + values = [] + if isinstance(input, dict): + values = list(input.keys()) + for key in list(input.keys()): + values = values + all_keys(input[key]) + if isinstance(input, list): + for value in input: + valleys = all_keys(value) + for val in valleys: + values.append(val) + return values + +# The basic object here is a 'Query:' it takes dictionary as input, +# as defined in the API, and returns a value +# via the 'execute' function whose behavior +# depends on the mode that is passed to it. +# Given the dictionary, it can return a number of objects. +# The "Search_limits" array in the passed dictionary determines how many +# elements it returns; this lets multiple queries be bundled together. +# Most functions describe a subquery that might be combined into one big query +# in various ways. + +def check_query(query): + if query['method'] in ["schema", "search"]: + # Queries below this only apply to "data" + return + for v in query['counttype']: + if not v in ['WordCount', 'TextCount']: + raise BookwormException({"code": 400, "message": 'Only "WordCount" and "TextCount"' + ' counts are supported by the SQL api, but passed {}'.format(v)}) + +class DuckQuery(object): + """ + The base class for a bookworm search. + """ + def __init__(self, query_object = {}, db = None, databaseScheme = None): + # Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it. + + check_query(query_object) + self.prefs = {} + self.query_object = query_object + self._db = db + + self.databaseScheme = databaseScheme + if databaseScheme is None: + self.databaseScheme = DuckSchema(self.db) + if query_object['method'] == 'schema': + return + self._wordswhere = None + self.words = "words" + self.defaults() # Take some defaults + self.derive_variables() # Derive some useful variables that the query will use. + self.set_operations() + self._groups = None + + @property + def db(self): + if self._db is None: + raise TypeError("Must supply database.") + else: + return self._db + + @property + def method(self): + return self.query_object['method'] + + def defaults(self): + # these are default values;these are the only values that can be set in the query + # search_limits is an array of dictionaries; + # each one contains a set of limits that are mutually independent + # The other limitations are universal for all the search limits being set. 
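For orientation, a hedged example of the kind of `query_object` that `check_query` and `DuckQuery` expect, assembled from the keys used in this module (`method`, `counttype`, `search_limits`, `groups`, `words_collation`); the field values are hypothetical:

```
query_object = {
    "method": "data",                     # "schema" and "search" skip the counttype check
    "counttype": ["WordCount", "TextCount"],
    "words_collation": "Case_Insensitive",
    "search_limits": {
        "word": ["whale"],
        "year": {"$gte": 1800, "$lt": 1900},
    },
    "groups": ["year"],
}
# check_query(query_object) passes; a counttype such as "WordsPerMillion"
# would raise BookwormException, since only WordCount and TextCount are
# supported by this API layer.
```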
+ + query_object = self.query_object + + self.wordsTables = None + # Set up a dictionary for the denominator of any fraction if it doesn't already exist: + self.search_limits = query_object['search_limits'] + self.words_collation = query_object.get('words_collation', "Case_Sensitive") + + lookups = { + "Case_Insensitive":'lowercase', + 'lowercase':'lowercase', + 'casesens':'word', + "case_insensitive":"lowercase", + "Case_Sensitive":"word", + 'stem':'stem'} + try: + self.word_field = lookups[self.words_collation] + except KeyError: + self.word_field = 'word' + + """ + @property + def groups(self): + if self._groups: + return self_groups + self.groups = set() + self.outerGroups = [] + self.finalMergeTables = set() + + try: + groups = query_object['groups'] + except: + groups = [] + + for group in groups: + + # There's a special set of rules for how to handle unigram and bigrams + multigramSearch = re.match("(unigram|bigram|trigram)([1-4])?", group) + + if multigramSearch: + if group == "unigram": + gramPos = "1" + gramType = "unigram" + + else: + gramType = multigramSearch.groups()[0] + try: + gramPos = multigramSearch.groups()[1] + except: + print("currently you must specify which bigram element you want (eg, 'bigram1')") + raise + + lookupTableName = "%sLookup%s" %(gramType, gramPos) + self.outerGroups.append(f"`{lookupTableName}`.`{self.word_field}` as {group}") + self.finalMergeTables.add(" JOIN %s as %s ON %s.wordid=w%s" %(self.wordsheap, lookupTableName, lookupTableName, gramPos)) + self.groups.add("words%s.wordid as w%s" %(gramPos, gramPos)) + + else: + self.outerGroups.append(group) + try: + if self.databaseScheme.aliases[group] != group: + # Search on the ID field, not the basic field. + # debug(self.databaseScheme.aliases.keys()) + self.groups.add(self.databaseScheme.aliases[group]) + table = self.databaseScheme.tableToLookIn[group] + + joinfield = self.databaseScheme.aliases[group] + self.finalMergeTables.add(f' JOIN "{table}" USING ("{joinfield}")') + else: + self.groups.add('"' + group + '"') + except KeyError: + self.groups.add('"' + group + '"') + + "" + There are the selections which can include table refs, and the groupings, which may not: + and the final suffix to enable fast lookup + "" + + self.selections = ",".join(self.groups) + self.groupings = ",".join([group for group in self.groups]) + + self.joinSuffix = "" + " ".join(self.finalMergeTables) + + self.counttype = query_object['counttype'] + if isinstance(self.counttype, (str)): + self.counttype = [self.counttype] + """ + @property + def word_limits(self): + if 'word' in self.limits: + return True + else: + return False + + def derive_variables(self): + # These are locally useful, and depend on the search limits put in. + self.limits = self.search_limits + + # Treat empty constraints as nothing at all, not as restricting to the set of nothing. 
+ for key in list(self.limits.keys()): + if self.limits[key] == []: + del self.limits[key] + self.set_operations() +# self.create_catalog_table() + + @property + def wordid_query(self): + return self.wordswhere + + if self.wordswhere != " TRUE ": + f = "SELECT wordid FROM { words } as words1 WHERE { wordswhere }".format(**self.__dict__) + logger.debug("`" + self.wordswhere + "`") + return " wordid IN ({})".format(f) + else: + return " TRUE " + + def time_rounding(self, fieldname): + if not 'date_resolution' in self.query_object: + return ['year'] + elif isinstance(self.query_object['date_resolution'], (str)): + return self.query_object['date_resolution'] + elif fieldname in self.query_object['date_resolution']: + return self.query_object['date_resolution'][fieldname] + else: + return ['year', 'month'] + + def make_group_query(self): + # Based on groups, determine what the groupings are, and the selections including alises. + # Includes date parsing. + + aliases = [] + fields = [] + for g in self.query_object["groups"]: + wrapped = f'"{g}"' + if g in self.databaseScheme.aliases: + alias = f'"{self.databaseScheme.aliases[g]}"' +# aliases.append(alias) + aliases.append(f"FIRST({wrapped}) as {wrapped}") + fields.append(alias) + elif self.databaseScheme.records[g]['dtype'].startswith("date"): + resolutions = set(self.time_rounding(g)) + date_exprs = [] + for res in ['year', 'month', 'day']: + if res in resolutions: + n = 2 + if res == 'year': + n = 4 + date_exprs.append(f'''FIRST(LPAD({res}("{g}")::char, {n}, '0'))''') + fields.append(f'{res}("{g}")') + else: + if res == 'year': + date_exprs.append("'-'") + # year needs a leading dash "--12-07", though support is rare + aliases.append(" || ".join(date_exprs) + f'AS {wrapped}') # Concatenate strings. + else: + aliases.append(wrapped) + fields.append(wrapped) + + grouping = " " + if len(self.query_object["groups"]) > 0: + grouping = "GROUP BY " + ", ".join(fields) + return aliases, grouping + + def main_table(self): + if self.gram_size() == 1: + return '"unigram__ncid" as main' + if self.gram_size() == 2: + return '"bigram__ncid" as main' + if self.gram_size() == 3: + return '"trigram__ncid" as main' + + def full_query_tables(self): + # Joins are needed to provide groups, but *not* to provide + # provide evidence for wheres. + + # But if there's a group, there may also need to be an associated where. + + if self.word_limits: + tables = [self.main_table()] + else: + tables = [] + cols = self.query_object['groups'] + s_keys = [k for k in pull_keys(self.limits) if not k in {"word", "unigram", "bigram", "trigram"}] + + enquoted = [f'"{tb}"' for tb in self.databaseScheme.tables_for_variables(cols + s_keys)] + + tabs = tables + enquoted + if len(enquoted) == 0: + tabs.append('"fastcat"') + if self.using_nwords and not '"fastcat"' in tabs: + tabs.append('"fastcat"') + return tabs + + @property + def query_tables(self): + tables = self.full_query_tables() + return " NATURAL JOIN ".join(tables) + + def base_query(self): + group_aliases, group_object = self.make_group_query() + return f""" + SELECT {', '.join(self.set_operations() + group_aliases)} + FROM {self.query_tables} + WHERE + {self._ncid_query()} + AND + {self.wordid_query} + AND + {self.catwhere} + {group_object} + """ + + @property + def catalog_table(self): + # NOT USED + # self.catalog = self.prefs['fastcat'] # 'catalog' # Can be replaced with a more complicated query in the event of longer joins. + + """ + + This should check query constraints against a list of tables, and + join to them. 
So if you query with a limit on LCSH, and LCSH + is listed as being in a separate table, it joins the table + "LCSH" to catalog; and then that table has one column, ALSO + called "LCSH", which is matched against. This allows a _ncid + to be a member of multiple catalogs. + + """ + + self.relevantTables = set() + + databaseScheme = self.databaseScheme + + cols = self.needed_columns() + + cols = [c for c in cols if not c in {"word", "word1", "word2", "word3", "word4"}] + if self.using_nwords: + cols.append("nwords") + print("\n\n", cols, "\n\n") + self.relevantTables = self.databaseScheme.tables_for_variables(cols) + + self.catalog = " NATURAL JOIN ".join(self.relevantTables) + return self.catalog + + @property + def catwhere(self): + # Where terms that don't include the words table join. + catlimits = dict() + + for key in list(self.limits.keys()): + if key not in ('word', 'word1', 'word2', 'word3', 'hasword') and not re.search("words[0-5]", key): + catlimits[key] = self.limits[key] + + if len(list(catlimits.keys())) > 0: + return where_from_hash(catlimits) + else: + return "TRUE" + + def gram_size(self): + try: + ls = [phrase.split() for phrase in self.limits['word']] + except: + return 0 + lengths = list(set(map(len, ls))) + if len(lengths) > 1: + raise BookwormException('400', 'Must pass all unigrams or all bigrams') + else: + return lengths[0] + + @property + def wordswhere(self): + if self._wordswhere: + return self._wordswhere + + if not self.word_limits: + self._wordswhere = " TRUE " + return " TRUE " + + limits = [] + + """ + This doesn't currently allow mixing of one and two word searches. + """ + + collation = self.query_object.get('words_collation', 'Case_Sensitive') + word_field = "word" + for phrase in self.limits['word']: + locallimits = dict() + array = phrase.split(" ") + for n, word in enumerate(array): + searchingFor = word + if collation == "stem": + from nltk import PorterStemmer + searchingFor = PorterStemmer().stem_word(searchingFor) + word_field = "stem" + if collation == "case_insensitive" or \ + collation == "Case_Insensitive": + # That's a little joke. Get it? + searchingFor = searchingFor.lower() + print(searchingFor) + word_field = "lowercase" + + + selectString = f"SELECT wordid FROM wordsheap WHERE \"{word_field}\" = '{searchingFor}'" + logger.warning(selectString) + self.db.execute(selectString) + + # Set the search key being used. + search_key = "wordid" + if self.gram_size() > 1: + # 1-indexed entries in the bigram tables. + search_key = f"word{n + 1}" + + for row in self.db.fetchall(): + wordid = row[0] + try: + locallimits[search_key] += [wordid] + except KeyError: + locallimits[search_key] = [wordid] + + if len(locallimits) > 0: + limits.append(where_from_hash(locallimits, comp = " = ", escapeStrings=False)) + + + self._wordswhere = "(" + ' OR '.join(limits) + ")" + if limits == []: + # In the case that nothing has been found, tell it explicitly to search for + # a condition when nothing will be found. + self._wordswhere = "_ncid = -1" + + wordlimits = dict() + + limitlist = copy.deepcopy(list(self.limits.keys())) + + for key in limitlist: + if re.search("words[0-5]", key): + wordlimits[key] = self.limits[key] + self.max_word_length = max(self.max_word_length, 2) + del self.limits[key] + print(self._wordswhere) + if len(list(wordlimits.keys())) > 0: + self._wordswhere = where_from_hash(wordlimits) + + return self._wordswhere + + def build_wordstables(self): + # Deduce the words tables we're joining against. 
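+        # Illustratively, a bigram search joins the words heap twice below
+        # (as words1 and words2); a unigram search joins it once.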
+ # The iterating on this can be made more general to get 3 or four grams in pretty easily. + # This relies on a determination already having been made about whether + # this is a unigram or bigram search; that's reflected in the self.selections + # variable. + + if self.wordsTables is not None: + return + + needsBigrams = (self.max_word_length == 2 or re.search("words2", self.selections)) + + needsUnigrams = self.max_word_length == 1; + + if self.max_word_length > 2: + err = dict(code=400, message="Phrase is longer than what Bookworm currently supports") + raise BookwormException(err) + + if needsBigrams: + self.main = ''' + bigrams__ncid as main + ''' + + self.wordstables = """ + JOIN %(wordsheap)s as words1 ON (main.word1 = words1.wordid) + JOIN %(wordsheap)s as words2 ON (main.word2 = words2.wordid) """ % self.__dict__ + + # I use a regex here to do a blanket search for any sort of word limitations. That has some messy sideffects (make sure the 'hasword' + # key has already been eliminated, for example!) but generally works. + + elif needsUnigrams: + self.main = ''' + unigram__ncid as main + ''' + + self.wordstables = """ + JOIN ( %(wordsheap)s as words1) ON (main.wordid = words1.wordid) + """ % self.__dict__ + + else: + """ + Have _no_ words table if no words searched for or grouped by; + instead just use nwords. This + means that we can use the same basic functions both to build the + counts for word searches and + for metadata searches, which is valuable because there is a + metadata-only search built in to every single ratio + query. (To get the denominator values). + + Call this OLAP, if you like. + """ + self.main = " " + self.operation = ','.join(self.set_operations(with_words = False)) + """ + This, above is super important: the operation used is relative to the counttype, and changes to use 'catoperation' instead of 'bookoperation' + That's the place that the denominator queries avoid having to do a table scan on full bookcounts that would take hours, and instead takes + milliseconds. + """ + self.wordstables = " " + self.wordswhere = " TRUE " + # Just a dummy thing to make the SQL writing easier. Shouldn't take any time. Will usually be extended with actual conditions. + + def set_operations(self): + + with_words = self.word_limits + output = [] + self.using_nwords = False + if with_words: + if "TextCount" in self.query_object['counttype']: + output.append("count(DISTINCT main._ncid) as 'TextCount'") + if "WordCount" in self.query_object['counttype']: + output.append("sum(main.count) as 'WordCount'") + else: + self.using_nwords = True + if "WordCount" in self.query_object['counttype']: + output.append("sum(nwords) as 'WordCount'") + if "TextCount" in self.query_object['counttype']: + output.append("count(DISTINCT _ncid) as 'TextCount'") + + return output + + def _ncid_query(self): + + q = f""" {self.catwhere} """ + logger.debug("'{}'".format(self.catwhere)) + if self.catwhere == "TRUE": + self._ncid_where = " TRUE " + else: + self._ncid_where = q + return self._ncid_where + + def query(self): + + """ + Return the SQL query that fills the API request. + + There must be a search method filled out. 
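+
+        A minimal sketch of a data call (the database and field names here
+        are hypothetical):
+
+            {"method": "data", "database": "example_bw",
+             "search_limits": {"word": ["test"]},
+             "groups": ["year"], "counttype": ["WordCount"]}
+
+        dispatches to base_query(); "schema" and "search" take the other
+        branches below.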
+ """ + + if (self.query_object['method'] == 'schema'): + return "SELECT name,type,description,tablename,dbname,anchor FROM masterVariableTable WHERE status='public'" + elif (self.query_object['method'] == 'search'): + return self.bibliography_query() + elif self.query_object['method'] == 'data': + return self.base_query() + else: + raise BookwormException('400', 'Must enter "schema", "search", or "data" as method') + + + def bibliography_query(self, limit = "100"): + # I'd like to redo this at some point so it could work as an API call more naturally. + self.limit = limit + self.ordertype = "sum(main.count*10000/nwords)" + try: + if self.query_object['ordertype'] == "random": + if self.counttype in [ + "WordCount" + ]: + self.ordertype = "RAND()" + else: + # This is a based on an attempt to match various + # different distributions I found on the web somewhere to give + # weighted results based on the counts. It's not perfect, but might + # be good enough. Actually doing a weighted random search is not easy without + # massive memory usage inside sql. + self.ordertype = "RAND()" + # self.ordertype = "LOG(1-RAND())/sum(main.count)" + except KeyError: + pass + + # If IDF searching is enabled, we could add a term like '*IDF' here to overweight better selecting words + # in the event of a multiple search. + self.idfterm = "" + prep = self.base_query() + + dicto = { + 'tables': self.make_join_query(), + 'ordertype': self.ordertype, + 'catwhere': self.catwhere, + 'limit': limit + } + + dicto['_ncid_where'] = self._ncid_query() + dicto['wordid_where'] = self.wordid_query + + bibQuery = """ + SELECT searchstring + FROM catalog RIGHT JOIN ( + SELECT {fastcat}._ncid, {ordertype} as ordering + FROM + {tables} + WHERE + {_ncid_where} AND {wordid_where} and {catwhere} + GROUP BY _ncid ORDER BY {ordertype} DESC LIMIT {limit} + ) as tmp USING (_ncid) ORDER BY ordering DESC; + """.format(**dicto) + return bibQuery + + def search_results(self): + # This is an alias that is handled slightly differently in + # APIimplementation (no "RESULTS" bit in front). Once + # that legacy code is cleared out, they can be one and the same. + + return json.loads(self.return_books()) + + def getActualSearchedWords(self): + # + if len(self.wordswhere) > 7: + words = self.query_object['search_limits']['word'] + # Break bigrams into single words. 
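+            # e.g. ["hot dog", "cat"] -> ["hot", "dog", "cat"] (illustrative values).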
+ words = ' '.join(words).split(' ') + q = "SELECT word FROM {} WHERE {}".format(self.wordsheap, where_from_hash({self.word_field:words})) + logger.debug(q) + self.db.execute(q) + self.actualWords = [item[0] for item in self.db.fetchall()] + else: + raise TypeError("Suspiciously low word count") + +def pull_keys(entry): + val = [] + if isinstance(entry, list) and not isinstance(entry, (str, bytes)): + for element in entry: + val += pull_keys(element) + elif isinstance(entry, dict): + for k,v in entry.items(): + if k[0] != "$": + val.append(k) + else: + val += pull_keys(v) + else: + return [] + + return [key for key in val] diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 5ce5675..68785d0 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -1,29 +1,32 @@ #!/usr/bin/python -from pandas import merge -from pandas import Series +from pandas import merge, Series, set_option, DataFrame from pandas.io.sql import read_sql -from pandas import merge -from pandas import set_option +import pandas as pd +from pyarrow import feather from copy import deepcopy from collections import defaultdict -from .SQLAPI import DbConnect -from .SQLAPI import userquery -from .mariaDB import Query +from .duckdb import DuckQuery from .bwExceptions import BookwormException +from .query_cache import Query_Cache import re import json import logging +logger = logging.getLogger("bookworm") + import numpy as np import csv import io import numpy as np +from urllib import request +from urllib import parse +import random """ The general API is some functions for working with pandas to calculate bag-of-words summary statistics according to the API description. -It is not bound to any particular backend: instead, a subset of +It is not bound to any particular backend: instead, a subset of methods in the API must be supported by subclassing APICall(). The only existing example of this is "SQLAPICall." @@ -99,49 +102,49 @@ class Aggregator(object): but there are a multitude of things you can do with those: basic things like frequency, all the way up to TF-IDF. - """ + """ def __init__(self, df, groups = None): self.df = df self.groups = groups def _aggregate(self, parameters): "Run the aggregation. Prefixed with an underscore so it doesn't show up in the dict." 
- + parameters = set(map(str, parameters)) for parameter in parameters: getattr(self, parameter)() return self.df - + def WordCount(self): self.df["WordCount"] = self.df["WordCount_x"] - + def TextCount(self): self.df["TextCount"] = self.df["TextCount_x"] - + def WordsPerMillion(self): self.df["WordsPerMillion"] = (self.df["WordCount_x"].multiply(1000000)/ self.df["WordCount_y"]) def TotalWords(self): self.df["TotalWords"] = self.df["WordCount_y"] - + def SumWords(self): self.df["SumWords"] = self.df["WordCount_y"] + self.df["WordCount_x"] - + def WordsRatio(self): self.df["WordsRatio"] = self.df["WordCount_x"]/self.df["WordCount_y"] - + def TextPercent(self): self.df["TextPercent"] = 100*self.df["TextCount_x"].divide(self.df["TextCount_y"]) - + def TextRatio(self): - self.df["TextRatio"] = self.df["TextCount_x"]/self.df["TextCount_y"] + self.df["TextRatio"] = self.df["TextCount_x"]/self.df["TextCount_y"] def TotalTexts(self): self.df["TotalTexts"] = self.df["TextCount_y"] - + def SumTexts(self): self.df["SumTexts"] = self.df["TextCount_y"] + self.df["TextCount_x"] - + def HitsPerText(self): self.df["HitsPerText"] = self.df["WordCount_x"]/self.df["TextCount_x"] @@ -152,13 +155,13 @@ def PMI_words(self): self.df["PMI_words"] = PMI(self.df, "WordCount_x", self.groups) def PMI_texts(self): - self.df["PMI_texts"] = PMI(self.df, "TextCount_x", self.groups) - + self.df["PMI_texts"] = PMI(self.df, "TextCount_x", self.groups) + def TFIDF(self): from numpy import log as log self.df["TF"] = self.df["WordCount_x"]/self.df["WordCount_y"] self.df["TFIDF"] = self.df["TF"] * np.log(self.df["TextCount_y"]/self.df['TextCount_x']) - + def Dunning(self): self.df["Dunning"] = DunningLog(self.df, "WordCount_x", "WordCount_y") @@ -167,7 +170,7 @@ def DunningTexts(self): self.df["DunningTexts"] = DunningLog(self.df, "TextCount_x", "TextCount_y") def rename(df, newkey): - + # Add "x" and "y" suffixed to the dataframes even when not explicitly needed. renamer = {} @@ -198,6 +201,16 @@ def need_comparison_query(count_types): needing_fields = [c for c in count_types if not c in ["WordCount","TextCount"]] return len(needing_fields) != 0 +def dates_to_iso(frame): + + for column in frame.columns: + if "date" in str(frame[column].dtype): + frame[column] = frame[column].apply(lambda x: x.isoformat()) + else: + print(str(frame[column].dtype)) + return frame + + def base_count_types(list_of_final_count_types): """ the final count types are calculated from some base types across both @@ -209,7 +222,7 @@ def base_count_types(list_of_final_count_types): subq = set() superq = set() - + for count_name in list_of_final_count_types: if count_name in ["WordCount", "WordsPerMillion", "WordsRatio", "TotalWords", "SumWords", "Dunning", "PMI_words", "TextLength", "HitsPerMatch", "TFIDF"]: @@ -237,16 +250,26 @@ class APIcall(object): Without a "return_pandas_frame" method, it won't run. """ - def __init__(self, APIcall): + def __init__(self, query): """ Initialized with a dictionary unJSONed from the API defintion. """ - self.query = APIcall + self.query = query self.idiot_proof_arrays() self.set_defaults() + + def clone(self, query): + """ + Make a clone of the APIcall object. + Used with multipart queries. + + Should be sure that query itself is deeply cloned. 
+ """ + return APIcall(query) + def set_defaults(self): query = self.query if "search_limits" not in query: @@ -257,10 +280,12 @@ def set_defaults(self): query["search_limits"]["word"] = query["search_limits"]["unigram"] del query["search_limits"]["unigram"] + + def idiot_proof_arrays(self): for element in ['counttype', 'groups']: try: - if not isinstance(self.query[element], list): + if isinstance(self.query[element], str): self.query[element] = [self.query[element]] except KeyError: # It's OK if it's not there. @@ -308,35 +333,39 @@ def data(self): self.pandas_frame = self.get_data_from_source() return self.pandas_frame + #@attr + #data_frame(self): + # if self._pandas_frame is not None: + # return self.return_pandas_frame + + def validate_query(self): self.ensure_query_has_required_fields() - + def ensure_query_has_required_fields(self): required_fields = ['counttype', 'groups', 'database'] - if self.query['method'] in ['schema', 'search']: + if self.query['method'] in ['schema', 'search', 'returnPossibleFields']: required_fields = ['database'] - + for field in required_fields: if field not in self.query: - logging.error("Missing field: %s" % field) + logger.error("Missing field: %s" % field) err = dict(message="Bad query. Missing \"%s\" field" % field, code=400) raise BookwormException(err) def prepare_search_and_compare_queries(self): - - call1 = deepcopy(self.query) call2 = deepcopy(call1) call2['search_limits'] = self.get_compare_limits() - + # The individual calls need only the base counts: not "Percentage of # Words," but just "WordCount" twice, and so forth call1['counttype'], call2['counttype'] = base_count_types(self.query['counttype']) - + # Drop out asterisks for that syntactic sugar. for limit in list(call1['search_limits'].keys()): if re.search(r'^\*', limit): @@ -363,14 +392,14 @@ def get_data_from_source(self): instance or something else, just by changing the bits in the middle where it handles storage_format. """ - + self.validate_query() if self.query['method'] in ['schema', 'search']: return self.generate_pandas_frame() - + self.prepare_search_and_compare_queries() - + """ This could use any method other than pandas_SQL: You'd just need to redefine "generate_pandas_frame" @@ -384,10 +413,10 @@ def get_data_from_source(self): try: df1 = self.generate_pandas_frame(self.call1) rename(df1, "x") - logging.debug(self.call2) + logger.debug(self.call2) df2 = self.generate_pandas_frame(self.call2) rename(df2, "y") - + except Exception as error: logging.exception("Database error") # One common error is putting in an inappropriate column @@ -401,14 +430,14 @@ def get_data_from_source(self): except: return Series({"status": "error", "message": "Unknown error. ", - "code":str(error)}) - + "code":str(error)}) + intersections = intersectingNames(df1, df2) """ Would this merge be faster with indexes? 
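+        (The join below is on whatever group columns df1, the search counts
+        suffixed _x, and df2, the comparison counts suffixed _y, share.)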
""" - + if len(intersections) > 0: merged = merge(df1, df2, on=intersections, how='outer') else: @@ -420,7 +449,7 @@ def get_data_from_source(self): gator = Aggregator(merged, self.query['groups']) calcced = gator._aggregate(calculations) # calcced = calculateAggregates(merged, calculations, self.query['groups']) - + calcced = calcced.fillna(int(0)) final_DataFrame = (calcced[self.query['groups'] + @@ -431,135 +460,98 @@ def get_data_from_source(self): def execute(self): method = self.query['method'] - logging.debug("Preparing to execute with method '{}'".format(method)) fmt = self.query['format'] if 'format' in self.query else False + if not 'method' in self.query: + return "You must pass a method to the query." + if method=="returnPossibleFields": + self.query['method'] = "schema" + method = "schema" - if method == 'data' or method == 'schema' or method == 'search': - version = 2 - if fmt in ['json_c', 'search', 'html', 'csv', 'tsv']: - version = 3 - else: - version = 1 - - if version == 1: + version = 3 + try: # What to do with multiple search_limits + if isinstance(self.query['search_limits'], list): - if method in ["json", "return_json"]: - self.query['method'] = 'data' - self.query['format'] = 'json' - return self.multi_execute(version=version) + if fmt == "json" or version >= 3: + frame = self.multi_execute(version = version) else: # Only return first search limit if not return in json self.query['search_limits'] = self.query['search_limits'][0] - - form = method[7:] if method[:6] == 'return' else method - - logging.warning("method == \"%s\" is deprecated. Use method=\"data\" " - "with format=\"%s\" instead." % (method, form)) + else: + frame = self.data() - if method == "return_json" or method == "json": - self.query['method'] = 'data' - self.query['format'] = 'json' - return self.return_json(version=1) + if fmt == "json": + val = dates_to_iso(frame) + val = val.where(pd.notnull(val), None) + val.replace([np.inf, -np.inf], None, inplace=True) + val = val.to_dict(orient = "records") + return self._prepare_response(val, version = 2) - elif method == "return_csv" or method == "csv": - self.query['method'] = 'data' - self.query['format'] = 'json' - frame = self.data() - return frame.to_csv(path = None, sep="\t", encoding="utf8", index=False, - quoting=csv.QUOTE_NONE, escapechar="\\") - elif version >= 2: - try: - # What to do with multiple search_limits - - if isinstance(self.query['search_limits'], list): - if fmt == "json" or version >= 3: - frame = self.multi_execute(version = version) - else: - # Only return first search limit if not return in json - self.query['search_limits'] = self.query['search_limits'][0] - else: - frame = self.data() - - if fmt == "json": - return self.return_json(version=2) - - if fmt == "csv": - return frame.to_csv(encoding="utf8", index=False) - - if fmt == "tsv": - return frame.to_csv(sep="\t", encoding="utf8", index=False) - - if fmt == "feather": - fout = io.BytesIO(b'') - try: - frame.to_feather(fout) - except: - logging.warning("You need the pyarrow package installed to export as feather.") - raise - fout.seek(0) - return fout.read() + if fmt == "csv": + return frame.to_csv(encoding="utf8", index=False) - if fmt == 'json_c': - return self.return_rle_json(frame) + if fmt == "tsv": + return frame.to_csv(sep="\t", encoding="utf8", index=False) - if fmt == 'html': - return self.html(frame) - - else: - err = dict(status="error", code=200, - message="Only formats in ['csv', 'tsv', 'json', 'feather']" - " currently supported") - return json.dumps(err) - 
except BookwormException as e: - # Error status codes are HTTP codes - # http://www.restapitutorial.com/httpstatuscodes.html - err = e.args[0] - err['status'] = "error" - return json.dumps(err) - except Exception as ex: - # General Uncaught error. - logging.exception("{}".format(ex)) - logging.exception("Database error") - return json.dumps({"status": "error", "message": "Database error. " - "Try checking field names."}) - - # Temporary catch-all pushes to the old methods: - if method in ["returnPossibleFields", "search_results", - "return_books", "schema"]: + if fmt == "feather" or fmt == "feather_js": + compression = "zstd" + if fmt == "feather_js": + compression = "uncompressed" + fout = io.BytesIO(b'') try: - query = userquery(self.query) - if method == "return_books": - return query.execute() - return json.dumps(query.execute()) - except Exception as e: - if len(str(e)) > 1 and e[1].startswith("Unknown database"): - return "No such bookworm {}".format(e[1].replace("Unknown database","")) + feather.write_feather(frame, fout, compression = compression) except: - return "General error" + logger.error("You need the pyarrow package installed to export as feather.") + raise + fout.seek(0) + return fout.read() + + if fmt == 'json_c': + return self.return_rle_json(frame) + + if fmt == 'html': + return self.html(frame) + + else: + err = dict(status="error", code=200, + message="Only formats in ['csv', 'tsv', 'json', 'feather']" + " currently supported") + return json.dumps(err) + except BookwormException as e: + # Error status codes are HTTP codes + # http://www.restapitutorial.com/httpstatuscodes.html + err = e.args[0] + err['status'] = "error" + return json.dumps(err) + except Exception as ex: + # General Uncaught error. + logging.exception("{}".format(ex)) + logging.exception("Database error") + return json.dumps({"status": "error", "message": "Database error. " + "Try checking field names."}) def multi_execute(self, version=1): - + """ Queries may define several search limits in an array if they use the return_json method. """ - + if version <= 2: returnable = [] for limits in self.query['search_limits']: child = deepcopy(self.query) child['search_limits'] = limits - q = self.__class__(child).return_json(raw_python_object=True, - version=version) + q = self.clone(child).return_json(raw_python_object=True, + version=version) returnable.append(q) return self._prepare_response(returnable, version) - - if version == 3: + + if version >= 3: for i, limits in enumerate(self.query['search_limits']): child = deepcopy(self.query) child['search_limits'] = limits - f = self.__class__(child).data() + f = self.clone(child).data() f['Search'] = i if i == 0: frame = f @@ -567,40 +559,18 @@ def multi_execute(self, version=1): frame = frame.append(f, ignore_index = True) return frame - def html(self, data): """ - Return data in column-oriented format with run-length encoding - on duplicate values. + return an HTML table. """ if isinstance(data, Series) and 'status' in data: # If data has a status, Bookworm is trying to send us an error return data.to_json() - + set_option('display.max_colwidth', -1) return data.to_html(escape = False, index = False) - - def return_rle_json(self, data): - """ - Return data in column-oriented format with run-length encoding - on duplicate values. 
- """ - - if isinstance(data, Series) and 'status' in data: - # If data has a status, Bookworm is trying to send us an error - return data.to_json() - - output = {'status':'success', 'data':{}} - - for k in data: - series = data[k] - output['data'][k] = rle(data[k].tolist()) - - return json.dumps(output) - - def return_json(self, raw_python_object=False, version=1): ''' Get JSON data for a single search_limit. @@ -625,6 +595,7 @@ def fixNumpyType(input): # Define a recursive structure to hold the stuff. def tree(): return defaultdict(tree) + returnt = tree() for row in data.itertuples(index=False): @@ -643,7 +614,7 @@ def tree(): for r in row ] except: - logging.warning(row) + logger.warning(row) pass destination[key] = row break @@ -664,26 +635,50 @@ def _prepare_response(self, data, version=1): resp = dict(status="error", data="Internal error: unknown response version") - try: - return json.dumps(resp) - except ValueError: - return json.dumps(resp) + return json.dumps(resp) + +class MetaAPIcall(APIcall): + def __init__(self, endpoints): + self.endpoints = endpoints + super().__init__(self) + def connect(self, endpoint): + # return some type of a connection. + pass -class oldSQLAPIcall(APIcall): + def generate_pandas_frame(self, call): + if call is None: + call = deepcopy(self.query) + call['format'] = 'feather' + for endpoint in self.endpoints: + connection = self.connect(endpoint) + d = connection.query(call) + count_fields = [] + + for field in ['WordCount', 'TextCount']: + if field in call["counttype"]: + count_fields.push(field) + together = pd.concat(d) + together[count_fields].sum() + +class DuckDBCall(APIcall): + """ + Fetches from DuckDB. Must create a connection before passing, + to discourage on-the-fly creation which is slow. """ - To make a new backend for the API, you just need to extend the base API - call class like this. - This one is comically short because all the real work is done in the - userquery object. + def __init__(self, query, db): - But the point is, you need to define a function "generate_pandas_frame" - that accepts an API call and returns a pandas frame. + self.db = db + super().__init__(query) - But that API call is more limited than the general API; you only need to - support "WordCount" and "TextCount" methods. - """ + def clone(self, query): + """ + Make a clone of the object. + Used with multipart queries. + + """ + return DuckDBCall(query, db = self.db) def generate_pandas_frame(self, call = None): """ @@ -695,47 +690,117 @@ def generate_pandas_frame(self, call = None): more legacy code. """ - if call is None: call = self.query - - con = DbConnect(prefs, self.query['database']) - q = userquery(call).query() - df = read_sql(q, con.db) + q = DuckQuery(call, db = self.db) + if call['method'] == 'schema': + m = q.databaseScheme.to_pandas() + return m + query = q.query() + logger.warning("Preparing to execute {}".format(query)) + df = dates_to_iso(self.db.execute(query).df()) + logger.debug("Query retrieved") return df -class SQLAPIcall(APIcall): - """ - To make a new backend for the API, you just need to extend the base API - call class like this. - This one is comically short because all the real work is done in the - userquery object. 
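+
+# Sketch of typical use of DuckDBCall (the path, database name, and group
+# field are hypothetical):
+#
+#   import duckdb
+#   con = duckdb.connect("/path/to/bookworm_dbs/example_bw", read_only = True)
+#   call = {"method": "data", "database": "example_bw", "format": "json",
+#           "search_limits": {}, "groups": ["year"], "counttype": ["WordCount"]}
+#   result = DuckDBCall(call, db = con).execute()
+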
+def my_sort(something): + if type(something) == list: + return sorted(something) + if type(something) == dict: + keys = list(something.keys()) + keys.sort() + output = {} + for k in keys: + output[k] = something[k] + return output + return something + +def standardized_query(query: dict) -> dict: + trimmed_call = {} + needed_keys = [ + 'search_limits', + 'compare_limits', + 'words_collation', + 'database', + 'method', + 'groups', + 'counttypes', + ] + needed_keys.sort() + for k in needed_keys: + try: + trimmed_call[k] = my_sort(query[k]) + except KeyError: + continue + return trimmed_call - But the point is, you need to define a function "generate_pandas_frame" - that accepts an API call and returns a pandas frame. - But that API call is more limited than the general API; you only need to - support "WordCount" and "TextCount" methods. +class ProxyAPI(APIcall): + """ - - def generate_pandas_frame(self, call = None): + Forward a request to a remote url. + + Can be useful if you want a proxy server with caching on one server which + reaches out to a different server for uncached requests, or perhaps + if you want a single gateway for multiple different bookworms. + + """ + + def __init__(self, query, endpoint): """ - - This is good example of the query that actually fetches the results. - It creates some SQL, runs it, and returns it as a pandas DataFrame. - - The actual SQL production is handled by the userquery class, which uses - more legacy code. - + Endpoint: A URL, like `http://localhost:10012`. + """ + self.endpoint = endpoint + super().__init__(query) + + def generate_pandas_frame(self, call = None) -> DataFrame: + """ + Note--requires that the endpoint expose the new feather method. """ + + if call is None: + call = self.query + call = deepcopy(call) + call['format'] = 'feather' + query_string = json.dumps(call) + qstring = parse.quote(query_string) + remote_url = f"{self.endpoint}/?{qstring}" + buffer = io.BytesIO() + connection = request.urlopen(remote_url) + buffer.write(connection.read()) + try: + return feather.read_feather(buffer) + except: + # TODO: re-throw bookworm errors with additional context. + raise +class Caching_API(APIcall): + def __init__(self, query: dict, cache: Query_Cache, fallback_api: APIcall, **kwargs): + """ + cache: an existing Query_Cache method. These are expensive to create, + so you don't get one generated by default. + + fallback_api: Must be initialized with a parent API class that also + inherits from APICall. + + kwargs: are passed to the fallback API. + """ + self.cache = cache + self.Fallback = fallback_api + self.kwargs = kwargs + super().__init__(query) + + def generate_pandas_frame(self, call = None) -> DataFrame: if call is None: call = self.query - con = DbConnect(prefs, self.query['database']) - q = Query(call).query() - logging.debug("Preparing to execute {}".format(q)) - df = read_sql(q, con.db) - logging.debug("Query retrieved") - return df - + + trimmed_call = standardized_query(call) + try: + return self.cache[trimmed_call] + except FileNotFoundError: + resolution = self.Fallback(call, **self.kwargs).generate_pandas_frame() + self.cache[trimmed_call] = resolution + if random.random() < .1: + # Don't bother doing this every time. 
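+                # (So roughly one request in ten pays the cost of trimming the cache.)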
+ self.cache.trim_cache() + return resolution \ No newline at end of file diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index 11f7350..8ad32a3 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -1,12 +1,17 @@ -from __future__ import print_function import re -from subprocess import call -from subprocess import Popen -import logging + +from pathlib import Path import sys import os import bookwormDB import argparse +import json +import nonconsumptive as nc +from .store import store +import logging +import yaml +from nonconsumptive.commander import namespace_to_kwargs, add_builder_parameters +logger = logging.getLogger("bookworm") """ This is the code that actually gets run from the command-line executable. @@ -16,12 +21,6 @@ the run_arguments function pulls commands from the command line. Any useful new bookworm methods should be passed through run_arguments to work. - -Some modules, especially bookworm-specific ones, -are imported inline in the code here--that substantially -(as in, 1 second to 0.2 seconds) reduces startup time -for the command-line executable, -even though it's not best practice otherwise. """ class BookwormManager(object): @@ -39,25 +38,6 @@ def __init__(self, cnf_file=None, database=None): self.basedir = None self.dbname = None - for i in range(10): - basedir = "../"*i - if os.path.exists(basedir + ".bookworm"): - self.basedir = basedir - break - if self.basedir==None: - logging.debug("No bookworm directory found; hopefully this isn't a build call.") - - if cnf_file is not None: - config = configparser.ConfigParser(allow_no_value=True) - config.read([cnf_file]) - if config.has_section("client"): - """ - Silently go along if the config doesn't exist. - """ - try: - self.dbname = config.get("client", "database") - except configParser.NoOptionError: - pass # More specific options override the config file if database is not None: @@ -68,384 +48,48 @@ def config(self,args): """ Performs useful configuration tasks, such as setting up a MySQL installation. """ - if args.target=="mysql": - import bookwormDB.configuration - bookwormDB.configuration.recommend_my_cnf() - if args.target=="mysql-info": - from bookwormDB.configuration import Configfile - config = Configfile("admin") - print("The admin configuration login currently being used should be the following.\n") - config.write_out() if args.target=="apache": from bookwormDB.configuration import apache apache() - def ftokenize(self, args): - - import bookwormDB.tokenizer - - """ - Handle functions related to tokenization and encoding. - - Should eventually be able to accept arguments like "token-regex" - and already-tokenized documents. - """ - - if args.process == "encode": - self.encoded(args) - - if args.process == "text_stream" or args.process == "token_stream": - raise NotImplementedError("This feature has been removed") - - if args.process == "word_db": - self.wordlist(args) - - def init(self, args): - """ - Initialize the current directory as a bookworm directory. - """ - # Create a configuration file - if not args.force: - if os.path.exists(".bookworm"): - logging.error(""" - You already have a folder named '.bookworm'. - Probably you've already initialized a Bookworm here. 
- """) - return - if not os.path.exists("bookworm.cnf"): - fout = open("bookworm.cnf", "w") - if self.dbname: - loc = self.dbname - else: - loc = os.path.relpath(".", "..") - print("Configuring Bookworm named '{}'".format(loc)) - print("Change the file at bookworm.cnf if this is undesirable".format(loc)) - fout.write("[client]\ndatabase = {}\n".format(loc)) - else: - fout = open("bookworm.cnf", "w") - loc = os.path.relpath(".", "..") - print("Configuring Bookworm named '{}'".format(loc)) - print("Change the file at bookworm.cnf if this is undesirable".format(loc)) - fout.write("[client]\ndatabase = {}\n".format(loc)) - def query(self, args): """ Run a query against the API from the command line. """ - from bookwormDB.general_API import SQLAPIcall + from bookwormDB.general_API import DuckDBCall import json + import duckdb + query = args.APIcall + logger.info(query) + con = duckdb.connect("/drobo/bookworm_dbs/" + query['database'], read_only = True) + caller = DuckDBCall(query = query, con = con) + logger.info(caller.execute()) - query = json.loads(args.APIcall) - caller = SQLAPIcall(query) - print(caller.execute()) - - def serve(self,args): + def serve(self, args): """ Serve the api. """ from bookwormDB.wsgi import run - run(args.bind, args.workers) - - import http.server - from http.server import HTTPServer - import shutil - - base_dir = args.dir - base_cgi_dir = os.path.normpath(base_dir + "/" + "cgi-bin") - d3_dir = os.path.normpath(base_dir + "/" + "D3") - for dir in [base_dir,base_cgi_dir]: - if not os.path.exists(dir): - os.makedirs(dir) - - API = os.path.normpath(os.path.dirname(bookwormDB.__file__) + "/bin/dbbindings.py") - if not os.path.exists(base_cgi_dir + "/" + API): - shutil.copy(API, base_cgi_dir) - - if not os.path.exists(d3_dir): - call(["git","clone","http://github.com/bmschmidt/BookwormD3",d3_dir]) - - # Use the Makefile to build the linechartGUI. This is a little Rube Goldberg-y. - args.target="linechartGUI" - - raise TypeError("The line below this is nonsense") - self.prep(args) - - os.chdir(base_dir) - # Actually serve it. - PORT = args.port - - httpd = HTTPServer(("", PORT), http.server.CGIHTTPRequestHandler) - - print("\n\n" + "****"*20) - print("A local bookworm server is now running") - print("You can now view some charts in a web-browser at http://localhost:%d/D3" % PORT) - print("If you have a time variable, linecharts are at http://localhost:%d/%s" % (PORT,self.dbname)) - print("Please note that this is not a very secure way: if you plan to put your bookworm") - print("on the open web, consider using apache.") - httpd.serve_forever() - - - def extension(self,args): - """ - Creates (or updates) an extension - """ - - if not os.path.exists(self.basedir + ".bookworm/extensions"): - os.makedirs(self.basedir + ".bookworm/extensions") - - my_extension = Extension(args,basedir = self.basedir) - my_extension.clone_or_pull() - my_extension.make() + run(args.port, args.bind, args.workers) def build(self, args): - self.prep(args) - - def prep(self, args): - """ - This is a wrapper to all the functions define here: the purpose - is to continue to allow access to internal methods in, for instance, - the Makefile, without documenting all of them in separate functions. - - That's a little groaty, I know. - """ - logging.debug(args) - - getattr(self, args.goal)(args) - - def wordlist(self, args): - """ - Create a wordlist of the top 1.5 million words. 
- """ - from .countManager import create_wordlist - if os.path.exists(".bookworm/texts/wordlist/wordlist.txt"): - return - try: - os.makedirs(".bookworm/texts/wordlist") - except FileExistsError: - pass - - input = "input.txt" - if args.feature_counts: - logging.info(args.feature_counts) - input = [a for a in args.feature_counts if 'unigrams' in a][0] - create_wordlist(n = 1.5e06, - input = input, - output = ".bookworm/texts/wordlist/wordlist.txt") - - def pristine(self, args): - - import bookwormDB.CreateDatabase - bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) - if self.dbname == "mysql": - raise NameError("Don't try to delete the mysql database") - bookworm.db.query("DROP DATABASE IF EXISTS {}".format(self.dbname)) - - def encoded(self, args): - """ - Using the wordlist and catalog, create encoded files. - """ - self.wordlist(args) - self.derived_catalog(args) - - for k in ['unigrams', 'bigrams', 'trigrams', 'quadgrams', 'completed']: - try: - os.makedirs(".bookworm/texts/encoded/{}".format(k)) - except FileExistsError: - pass - from .countManager import encode_words - - if args.feature_counts: - for feature in args.feature_counts: - encode_words(".bookworm/texts/wordlist/wordlist.txt", feature) - else: - encode_words(".bookworm/texts/wordlist/wordlist.txt", "input.txt") - - def all(self, args): - self.preDatabaseMetadata(args) - self.encoded(args) - self.database_wordcounts(args) - self.database_metadata(args) - - def preDatabaseMetadata(self, args=None, **kwargs): - import os - if not os.path.exists("field_descriptions.json"): - self.guessAtFieldDescriptions() - self.derived_catalog(args) - import bookwormDB.CreateDatabase - # Doesn't need a created database yet, just needs access - # to some pieces. - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase() - logging.info("Writing metadata to new catalog file...") - Bookworm.variableSet.writeMetadata() - - # This creates helper files in the /metadata/ folder. - - def derived_catalog(self, args): - - if not os.path.exists(".bookworm/metadata"): - os.makedirs(".bookworm/metadata") - if os.path.exists(".bookworm/metadata/jsoncatalog_derived.txt"): - return - - from bookwormDB.MetaParser import parse_catalog_multicore, ParseFieldDescs - - logging.debug("Preparing to write field descriptions") - ParseFieldDescs(write = True) - logging.debug("Preparing to write catalog") - parse_catalog_multicore() - - def guessAtFieldDescriptions(self, args = None, **kwargs): - - """ - Use a number of rules of thumb to automatically generate a field_descriptions.json file. - This may bin some categories incorrectly (depending on names, for example it may treat dates - as either categorical or time variables). - """ - - import bookwormDB.CreateDatabase - import json - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) - Bookworm.setVariables("jsoncatalog.txt", jsonDefinition=None) - import os - if not os.path.exists("field_descriptions.json"): - output = open("field_descriptions.json","w") - guess = json.dumps(Bookworm.variableSet.guessAtFieldDescriptions(), indent = 2) - logging.warning("Creating guess for field descriptions at: {}".format(guess)) - output.write(guess) - else: - logging.error(""" - You already have a file at field_descriptions.json - Dying rather than overwrite it. 
- """) - sys.exit() - - def reload_memory(self,args): - import bookwormDB.CreateDatabase - dbnames = [self.dbname] - if args.all==True: - dbnames = [] - datahandler = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname,variableFile=None) - cursor = datahandler.db.query("SELECT TABLE_SCHEMA FROM information_schema.tables WHERE TABLE_NAME='masterTableTable'") - for row in cursor.fetchall(): - dbnames.append(row[0]) - logging.info("The following databases are bookworms to be reloaded:") - for name in dbnames: - logging.info("\t" + name) - - for database in dbnames: - logging.info("Reloading memory tables for %s" %database) - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(database,variableFile=None) - Bookworm.reloadMemoryTables(force=args.force) - - def database_metadata(self, args): - import bookwormDB.CreateDatabase - logging.debug("creating metadata db") - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) - Bookworm.variableSet.loadMetadata() - - logging.debug("creating metadata variable tables") - - # This creates a table in the database that makes the results of - # field_descriptions accessible through the API, and updates the - - Bookworm.loadVariableDescriptionsIntoDatabase() - - - Bookworm.create_fastcat_and_wordsheap_disk_tables() - - # The temporary memory tables are no longer automatically created on a build. - # To create them, use `bookworm reload_memory`. - # Bookworm.reloadMemoryTables() - - #print "adding cron job to automatically reload memory tables on launch" - #print "(this assumes this machine is the MySQL server, which need not be the case)" - #call(["sh","scripts/scheduleCronJob.sh"]) - Bookworm.jsonify_data() # Create the self.dbname.json file in the root directory. - Bookworm.create_API_settings() - - Bookworm.grantPrivileges() + from .builder import BookwormCorpus + nc_params = namespace_to_kwargs(args) + db_path = args.db_directory / args.database + corp = BookwormCorpus( + db_location = db_path, + ngrams = args.ngrams, + **nc_params, + cache_set = {"tokenization", "word_counts", + "encoded_unigrams", "document_lengths"}) + corp.build() + def add_metadata(self, args): - import bookwormDB.CreateDatabase - import bookwormDB.convertTSVtoJSONarray - bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname,None) - anchorField = args.key - if args.format == "tsv": - # TSV is just converted into JSON in a file at tmp.txt, and slurped in that way. - if args.key is None: - args.key = open(args.file).readline().split("\t")[0] - f = "tmp.txt" - bookwormDB.convertTSVtoJSONarray.convertToJSON(args.file, f) - args.file = f - - bookworm.importNewFile(args.file, - anchorField=args.key, - jsonDefinition=args.field_descriptions) - - - def database_wordcounts(self, args = None, **kwargs): - """ - Builds the wordcount components of the database. This will die - if you can't connect to the database server. 
- """ - cmd_args = args - import bookwormDB.CreateDatabase - - index = True - reverse_index = True - ingest = True - newtable = True - - if cmd_args and hasattr(cmd_args, "index_only"): - if cmd_args.index_only: - ingest = False - newtable = False - else: - index = not cmd_args.no_index - newtable = not cmd_args.no_delete - reverse_index = not cmd_args.no_reverse_index - if not (newtable and ingest and index): - logging.warn("database_wordcounts args not supported for bigrams yet.") - - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) - Bookworm.load_word_list() - Bookworm.create_unigram_book_counts(newtable=newtable, ingest=ingest, index=index, reverse_index=reverse_index) - Bookworm.create_bigram_book_counts() - -class Extension(object): - - """ - A bookworm extension. Initialized with an args object, - which has the element url, the location of a clonable git repo. - - Because I don't want people to have to write extensions in python, - they are build using `make`. - """ - - def __init__(self,args,basedir="./"): - self.args = args - self.dir = basedir + ".bookworm/extensions/" + re.sub(".*/","",self.args.url) - - def clone_or_pull(self): - if not os.path.exists(self.dir): - logging.info("cloning git repo from " + self.args.url) - call(["git","clone",self.args.url,self.dir]) - else: - logging.info("updating pre-existing git repo at " + self.dir) - Popen(["git","pull"],cwd=self.dir) - - def make(self): - logging.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~") - logging.debug("Running make in " + self.dir) - Popen(["make"], cwd=self.dir) - -# Initiate MySQL connection. - - -# Pull a method from command line input. + raise NotImplementedError("Functionality missing in 3.0") def run_arguments(): """ @@ -462,113 +106,49 @@ def run_arguments(): Refactoring pull requests welcome. """ - parser = argparse.ArgumentParser(description='Build and maintain a Bookworm database.',prog="bookworm") + parser = argparse.ArgumentParser( + description='Build and maintain a Bookworm database.', + prog="bookworm") parser.add_argument("--configuration","-c",help="The name of the configuration file to read options from: by default, 'bookworm.cnf' in the current directory.", default="bookworm.cnf") - parser.add_argument("--database","-d",help="The name of the bookworm database in MySQL to connect to: by default, read from the active configuration file.", default=None) - - parser.add_argument("--log-level","-l", help="The logging detail to use for errors. Default is 'warning', only significant problems; info gives a fuller record, and 'debug' dumps many MySQL queries, etc.",choices=["warning","info","debug"],type=str.lower,default="warning") - - parser.add_argument("--feature-counts", action='append', - help="Use pre-calculated feature counts rather than tokenizing complete text on the fly. Supply any number of single files per count level like 'input.unigrams', 'input.bigrams', etc.") + parser.add_argument("--log-level", "-l", + help="The logging detail to use for errors." + "Default is 'warning', only significant problems; info gives a " + "fuller record, and 'debug' dumps many db queries, etc.", + choices=["warning","info","debug"],type=str.lower,default="warning") - parser.add_argument("--ngrams",nargs="+",default=["unigrams","bigrams"],help="What levels to parse with. Multiple arguments should be unquoted in spaces. This option currently does nothing.") + parser.add_argument("--ngrams", type = int, default = 2, help = "How many ngrams to create count tables for. Maximum 5. 
Large values will dramatically slow creation.") + parser.add_argument("--db-directory", help = "" + "Directory where duckdb databases live.", default = None, type = Path) + parser.add_argument("--database", "-d", help = "" + "The database name inside db-folder for this command. " + "Not relevant for 'serve' commands.", + default = None + ) + # Use subparsers to have an action syntax, like git. - subparsers = parser.add_subparsers(title="action", help='The commands to run with Bookworm', dest="action") - - + subparsers = parser.add_subparsers(title="action", + help='The commands to run with Bookworm', + dest="action") ############# build ################# - build_parser = subparsers.add_parser("build",description = "Create files",help="""Build up the component parts of a Bookworm.\ - - if you specify something far along the line (for instance, the linechart GUI), it will\ - build all prior files as well.""") - build_parser.add_argument("target", help="The make that you want to build. To build a full bookworm, type 'build all'.") + build_parser = subparsers.add_parser("build", + description = "Create files", + help="Build up the component parts of a Bookworm. " + "if you specify something far along the line") - # Grep out all possible targets from the Makefile + # Inherited directly from nonconsumptive.commander. + add_builder_parameters(build_parser) ############# supplement ################# supplement_parser = subparsers.add_parser("add_metadata",help="""Supplement the\ metadata for an already-created Bookworm with new items. They can be keyed to any field already in the database.""") supplement_parser.add_argument("-f","--file",help="""The location of a file with additional metadata to incorporate into your bookworm.""",required=True) - supplement_parser.add_argument( - "--format", - help="""The file format of the new metadata.\ - Must be "json" or "tsv". For JSON, the format is the same as the default\ - jsoncatalog.txt (a text file of json lines, each corresponding to a metadata field);\ - for TSV, a tsv with first line of which is column names,\ - and the first column of which is shared key (like filename). The TSV format,\ - particularly without field descriptions, is much easier to use, but doesn't\ - permit multiple values for the same key.""", - default="json",type=str.lower,choices=["tsv","json"]) - - supplement_parser.add_argument("--key",help="""The name of the key. If not specified and input type is TSV, the first column is used.""",default=None) - supplement_parser.add_argument("--field_descriptions","-d",help="""A description of the new metadata in the format of "field_descriptions.json"; if empty, we'll just guess at some suitable values.""",default=None) - - ######### Reload Memory ############# - memory_tables_parser = subparsers.add_parser("reload_memory",help="Reload the memory\ - tables for the designated Bookworm; this must be done after every MySQL restart") - memory_tables_parser.add_argument("--force-reload",dest="force",action="store_true", - help="Force reload on all memory tables. Use\ - '--skip-reload' for faster execution. On by default\ - .") - memory_tables_parser.add_argument("--skip-reload",dest="force",action="store_false", - help="Don't reload memory tables which have at least\ - one entry in them. Significantly faster, but may produce\ - bad results if the underlying tables have been\ - changed. 
Good for maintenance, bad for actively updated\ - installations.") - memory_tables_parser.set_defaults(force=False) - memory_tables_parser.add_argument("--all",action="store_true",default=False, - help="Search for all bookworm installations on\ - the server, and reload memory tables for each of them.") - - - ########## Clone and run extensions - extensions_parser = subparsers.add_parser("extension", help="Install Extensions to the current directory") - extensions_parser.add_argument("url",help="A cloneable url for the extension you want to pul: passed as an argument to 'git clone,' so may be either using the https protocol or the git protocol") - - - ########## Clone and run extensions - extensions_parser = subparsers.add_parser("query", help="Run a query using the Bookworm API") - extensions_parser.add_argument("APIcall",help="The json-formatted query to be run.") - - - ########## Build components - extensions_parser = subparsers.add_parser("prep", help="Build individual components.", aliases = ['build']) - extensions_subparsers = extensions_parser.add_subparsers(title="goal", help="The name of the target.", dest="goal") - - # Bookworm prep targets that allow additional args - catalog_prep_parser = extensions_subparsers.add_parser("preDatabaseMetadata", - help=getattr(BookwormManager, "preDatabaseMetadata").__doc__) - - word_ingest_parser = extensions_subparsers.add_parser("database_wordcounts", - help=getattr(BookwormManager, "database_wordcounts").__doc__) - word_ingest_parser.add_argument("--no-delete", action="store_true", help="Do not delete and rebuild the token tables. Useful for a partially finished ingest.") - - word_ingest_parser.add_argument("--no-reverse-index", action="store_true", help="When creating the table, choose not to index bookid/wordid/counts. This is useful for really large builds. Because this is specified at table creation time, it does nothing with --no-delete or --index-only.") - - word_ingest_parser.add_argument("--no-index", action="store_true", help="Do not re-enable keys after ingesting tokens. Only do this if you intent to manually enable keys or will run this command again.") - - word_ingest_parser.add_argument("--index-only", action="store_true", help="Only re-enable keys. Supercedes other flags.") - - # Bookworm prep targets that don't allow additional args - for prep_arg in BookwormManager.__dict__.keys(): - extensions_subparsers.add_parser(prep_arg, help=getattr(BookwormManager, prep_arg).__doc__) - - """ - Some special functions - """ - - init_parser = subparsers.add_parser("init",help="Initialize the current directory as a bookworm directory") - init_parser.add_argument("--force","-f",help="Overwrite some existing files.",default=False,action="store_true") - init_parser.add_argument("--yes","-y",help="Automatically use default values with no prompts",default=False,action="store_true") - # Serve the current bookworm @@ -578,13 +158,19 @@ def run_arguments(): "the gunicorn endpoint behind a more powerful webserver like apache or nginx.") serve_parser.add_argument("--full-site", action = "store_true", help="Serve a webpage as well as a query endpoint? 
Not active.") - - serve_parser.add_argument("--bind", "-b", default="10012", help="The port over which to serve the bookworm",type=int) - + serve_parser.add_argument("--port", "-p", default="10012", help="The port over which to serve the bookworm", type=int) + serve_parser.add_argument("--bind", "-b", default="127.0.0.1", help="The IP address to bind the server to.", type=str) serve_parser.add_argument("--workers", "-w", default="0", help="How many gunicorn worker threads to launch for the API. Reduce if you're seeing memory issues.",type=int) - serve_parser.add_argument("--dir","-d",default="http_server",help="A filepath for a directory to serve from. Will be created if it does not exist.") - +# serve_parser.add_argument("--API", "-a", default="MySQL", +# help="The type of API endpoint to run. 'MySQL' will" +# "will run MySQL") + serve_parser.add_argument("--cache", default = "none", + help="cache locations?") + serve_parser.add_argument("--cold-storage", default = "none", + help="A folder with cached query results. Allows long-term cold-storage.") + serve_parser.add_argument("--remote-host", default = None, + help="Hosts to pass queries through to. If enabled.") # Configure the global server. @@ -593,20 +179,44 @@ def run_arguments(): configure_parser.add_argument("--users",nargs="+",choices=["admin","global","root"],help="The user levels you want to act on.",default=["admin","global"]) configure_parser.add_argument("--force","-f",help="Overwrite existing configurations in potentially bad ways.",action="store_true",default=False) + + configure_parser = subparsers.add_parser("query", help="query the API directly. Inefficient compared to using a running host.") + configure_parser.add_argument("APIcall", help="A JSON string.", type = json.loads) + + # Call the function args = parser.parse_args() + if args.db_directory is None: + args.db_directory = Path(default_db_directory()) + if args.db_directory is None: + raise ValueError("You must specify a db directory or include one in a local config file.") + # stash those away. + store()['args'] = args # Set the logging level based on the input. numeric_level = getattr(logging, args.log_level.upper(), None) if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % args.log_level) # While we're at it, log with line numbers FORMAT = "[%(filename)s:%(lineno)s-%(funcName)s() %(asctime)s.%(msecs)03d] %(message)s" - logging.basicConfig(format=FORMAT, level=numeric_level, datefmt="%I:%M:%S") - logging.info("Info logging enabled.") - logging.info("Debug logging enabled.") + logging.basicConfig(format=FORMAT, datefmt="%I:%M:%S") + for logger_name in ["nonconsumptive", "bookworm"]: + logging.getLogger(logger_name).setLevel(numeric_level) + + logger.info("Info logging enabled.") + logger.debug("Debug logging enabled.") # Create the bookworm my_bookworm = BookwormManager(args.configuration, args.database) # Call the current action with the arguments passed in. 
- getattr(my_bookworm,args.action)(args) + # bookworm build --carefully + # becomes + # BookwormMangager.build(carefully = True) + getattr(my_bookworm, args.action)(args) + +def default_db_directory(): + for p in [Path.home() / ".bookworm.yml"]: + if p.exists(): + ks = yaml.safe_load(p.open()) + if "db_directory" in ks: + return ks["db_directory"] \ No newline at end of file diff --git a/bookwormDB/mariaDB.py b/bookwormDB/mariaDB.py deleted file mode 100644 index c9ebc9c..0000000 --- a/bookwormDB/mariaDB.py +++ /dev/null @@ -1,1034 +0,0 @@ -#!/usr/local/bin/python - -from .variableSet import to_unicode -from .search_limits import Search_limits -from .bwExceptions import BookwormException - -import json -import re -import copy -import MySQLdb -import hashlib -import logging - - - -# If you have bookworms stored on a different host, you can create more lines -# like this. -# A different host and read_default_file will let you import things onto a -# different server. - -class DbConnect(object): - # This is a read-only account - def __init__(self, database=None, - host=None): - - self.dbname = database - - import bookwormDB.configuration - conf = bookwormDB.configuration.Configfile("read_only").config - - if database is None: - raise BookwormException("You must specify a database") - - connargs = { - "db": database, - "use_unicode": 'True', - "charset": 'utf8', - "user": conf.get("client", "user"), - "password": conf.get("client", "password") - } - - if host: - connargs['host'] = host - # For back-compatibility: - else: - connargs['host'] = "localhost" - - try: - self.db = MySQLdb.connect(**connargs) - except: - try: - # Sometimes mysql wants to connect over this rather than a socket: - # falling back to it for backward-compatibility. - connargs["host"] = "127.0.0.1" - self.db = MySQLdb.connect(**connargs) - except: - raise - - self.cursor = self.db.cursor() - -def fail_if_nonword_characters_in_columns(input): - keys = all_keys(input) - for key in keys: - if re.search(r"[^A-Za-z_$*0-9]", key): - logging.error("{} has nonword character".format(key)) - raise - - -def all_keys(input): - """ - Recursive function. Get every keyname in every descendant of a dictionary. - Iterates down on list and dict structures to search for more dicts with - keys. - """ - values = [] - if isinstance(input, dict): - values = list(input.keys()) - for key in list(input.keys()): - values = values + all_keys(input[key]) - if isinstance(input, list): - for value in input: - valleys = all_keys(value) - for val in valleys: - values.append(val) - return values - -# The basic object here is a 'Query:' it takes dictionary as input, -# as defined in the API, and returns a value -# via the 'execute' function whose behavior -# depends on the mode that is passed to it. -# Given the dictionary, it can return a number of objects. -# The "Search_limits" array in the passed dictionary determines how many -# elements it returns; this lets multiple queries be bundled together. -# Most functions describe a subquery that might be combined into one big query -# in various ways. 
- -def check_query(query): - - - fail_if_nonword_characters_in_columns(query) - - for key in ['database']: - if not key in query: - raise BookwormException({"code": 400, "message": "You must specify a value for {}".format(key)}) - - - if query['method'] in ["schema", "search"]: - # Queries below this only apply to "data" - return - - for v in query['counttype']: - if not v in ['WordCount', 'TextCount']: - raise BookwormException({"code": 400, "message": 'Only "WordCount" and "TextCount"' - ' counts are supported by the SQL api, but passed {}'.format(v)}) - - -class Query(object): - """ - The base class for a bookworm search. - """ - def __init__(self, query_object = {}, db = None, databaseScheme = None): - # Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it. - - check_query(query_object) - - self.prefs = {'database': query_object['database']} - - self.query_object = query_object - - self.db = db - if db is None: - self.db = DbConnect(query_object['database']) - - self.databaseScheme = databaseScheme - if databaseScheme is None: - self.databaseScheme = databaseSchema(self.db) - - self.cursor = self.db.cursor - - # Some tablenames. - - self.wordsheap = self.databaseScheme.fallback_table('wordsheap') - self.fastcat = self.databaseScheme.fallback_table("fastcat") - logging.info("Catalog set to {}".format(self.fastcat)) - self.words = "words" - - self.defaults(query_object) # Take some defaults - - self.derive_variables() # Derive some useful variables that the query will use. - - def defaults(self, query_object): - # these are default values;these are the only values that can be set in the query - # search_limits is an array of dictionaries; - # each one contains a set of limits that are mutually independent - # The other limitations are universal for all the search limits being set. - - - - self.wordsTables = None - - - # Set up a dictionary for the denominator of any fraction if it doesn't already exist: - self.search_limits = query_object.setdefault('search_limits', [{"word":["polka dot"]}]) - self.words_collation = query_object.setdefault('words_collation', "Case_Insensitive") - - lookups = {"Case_Insensitive":'word', 'lowercase':'lowercase', 'casesens':'casesens', "case_insensitive":"word", "Case_Sensitive":"casesens", "All_Words_with_Same_Stem":"stem", 'stem':'stem'} - self.word_field = lookups[self.words_collation] - - self.time_limits = query_object.setdefault('time_limits', [0, 10000000]) - self.time_measure = query_object.setdefault('time_measure', 'year') - - self.groups = set() - self.outerGroups = [] - self.finalMergeTables = set() - - try: - groups = query_object['groups'] - except: - groups = None - - if groups == [] or groups == ["unigram"]: - # Set an arbitrary column name that will always be true if nothing else is set. - pass - # groups.insert(0, "1 as In_Library") - - if groups is None: - # A user query can't demand ungrouped results, - # but internally it's represented as None. 
- groups = [] - - for group in groups: - - # There's a special set of rules for how to handle unigram and bigrams - multigramSearch = re.match("(unigram|bigram|trigram)(\d)?", group) - - if multigramSearch: - if group == "unigram": - gramPos = "1" - gramType = "unigram" - - else: - gramType = multigramSearch.groups()[0] - try: - gramPos = multigramSearch.groups()[1] - except: - print("currently you must specify which bigram element you want (eg, 'bigram1')") - raise - - lookupTableName = "%sLookup%s" %(gramType, gramPos) - self.outerGroups.append("%s.%s as %s" %(lookupTableName, self.word_field, group)) - self.finalMergeTables.add(" JOIN %s as %s ON %s.wordid=w%s" %(self.wordsheap, lookupTableName, lookupTableName, gramPos)) - self.groups.add("words%s.wordid as w%s" %(gramPos, gramPos)) - - else: - self.outerGroups.append(group) - try: - if self.databaseScheme.aliases[group] != group: - # Search on the ID field, not the basic field. - # debug(self.databaseScheme.aliases.keys()) - self.groups.add(self.databaseScheme.aliases[group]) - table = self.databaseScheme.tableToLookIn[group] - - joinfield = self.databaseScheme.aliases[group] - self.finalMergeTables.add(" JOIN " + table + " USING (" + joinfield + ") ") - else: - self.groups.add(group) - except KeyError: - self.groups.add(group) - - """ - There are the selections which can include table refs, and the groupings, which may not: - and the final suffix to enable fast lookup - """ - - self.selections = ",".join(self.groups) - self.groupings = ",".join([group for group in self.groups]) - - self.joinSuffix = "" + " ".join(self.finalMergeTables) - - """ - Define the comparison set if a comparison is being done. - """ - - self.counttype = query_object.setdefault('counttype', ["WordCount"]) - - if isinstance(self.counttype, (str, bytes)): - self.counttype = [self.counttype] - - def determineOutsideDictionary(self): - """ - deprecated--tagged for deletion. - """ - self.compare_dictionary = copy.deepcopy(self.query_object) - if 'compare_limits' in list(self.query_object.keys()): - self.compare_dictionary['search_limits'] = self.query_object['compare_limits'] - del self.query_object['compare_limits'] - elif sum([bool(re.search(r'\*', string)) for string in list(self.query_object['search_limits'].keys())]) > 0: - # If any keys have stars at the end, drop them from the compare set - # This is often a _very_ helpful definition for succinct comparison queries of many types. - # The cost is that an asterisk doesn't allow you - - for key in list(self.query_object['search_limits'].keys()): - if re.search(r'\*', key): - # rename the main one to not have a star - self.query_object['search_limits'][re.sub(r'\*', '', key)] = self.query_object['search_limits'][key] - # drop it from the compare_limits and delete the version in the search_limits with a star - del self.query_object['search_limits'][key] - del self.compare_dictionary['search_limits'][key] - else: # if nothing specified, we compare the word to the corpus. - deleted = False - for key in list(self.query_object['search_limits'].keys()): - if re.search('words?\d', key) or re.search('gram$', key) or re.match(r'word', key): - del self.compare_dictionary['search_limits'][key] - deleted = True - if not deleted: - # If there are no words keys, just delete the first key of any type. - # Sort order can't be assumed, but this is a useful failure mechanism of last resort. Maybe. 
- try: - del self.compare_dictionary['search_limits'][list(self.query_object['search_limits'].keys())[0]] - except: - pass - """ - The grouping behavior here is not desirable, but I'm not quite sure how yet. - Aha--one way is that it accidentally drops out a bunch of options. I'm just disabling it: let's see what goes wrong now. - """ - - def derive_variables(self): - # These are locally useful, and depend on the search limits put in. - self.limits = self.search_limits - - # Treat empty constraints as nothing at all, not as restricting to the set of nothing. - for key in list(self.limits.keys()): - if self.limits[key] == []: - del self.limits[key] - - if 'word' in self.limits: - self.word_limits = True - else: - self.word_limits = False - - self.set_operations() - - self.create_catalog_table() - - self.make_catwhere() - - self.make_wordwheres() - - def tablesNeededForQuery(self, fieldNames=[]): - """ - Deprecated. - """ - db = self.db - neededTables = set() - tablenames = dict() - tableDepends = dict() - - q = "SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);" - logging.debug(q) - db.cursor.execute(q) - for row in db.cursor.fetchall(): - tablenames[row[0]] = row[2] - tableDepends[row[2]] = row[3] - - for fieldname in fieldNames: - parent = "" - try: - current = tablenames[fieldname] - neededTables.add(current) - n = 1 - while parent not in ['fastcat', 'wordsheap']: - parent = tableDepends[current] - neededTables.add(parent) - current = parent - n+=1 - if n > 100: - raise TypeError("Unable to handle this; seems like a recursion loop in the table definitions.") - # This will add 'fastcat' or 'wordsheap' exactly once per entry - except KeyError: - pass - - return neededTables - - def needed_columns(self): - """ - Given a query, what are the columns that the compiled search will need materialized? - - Important for joining appropriate tables to the search. - - Needs a recursive function so it will find keys deeply nested inside "$or" searches. - """ - cols = [] - - def pull_keys(entry): - val = [] - if isinstance(entry,list) and not isinstance(entry,(str, bytes)): - for element in entry: - val += pull_keys(element) - elif isinstance(entry,dict): - for k,v in entry.items(): - if k[0] != "$": - val.append(k) - else: - val += pull_keys(v) - else: - return [] - - return [re.sub(" .*","",key) for key in val] - - return pull_keys(self.limits) - - def wordid_query(self): - return self.wordswhere - - if self.wordswhere != " TRUE ": - f = "SELECT wordid FROM {words} as words1 WHERE {wordswhere}".format(**self.__dict__) - logging.debug("`" + self.wordswhere + "`") - return " wordid IN ({})".format(f) - else: - return " TRUE " - - def make_group_query(self): - aliases = [self.databaseScheme.aliases[g] for g in self.query_object["groups"]] - if len(aliases) > 0: - return "GROUP BY {}".format(", ".join(aliases)) - else: - return " " - - - def main_table(self): - if self.gram_size() == 1: - return 'master_bookcounts as main' - if self.gram_size() == 2: - return 'master_bigrams as main' - - def full_query_tables(self): - # Joins are needed to provide groups, but *not* to provide - # provide evidence for wheres. - - # But if there's a group, there may also need to be an associated where. 
- - if self.word_limits == False: - tables = [self.fastcat] - else: - tables = [self.main_table()] - - - cols = self.query_object['groups'] - ts = self.databaseScheme.tables_for_variables(cols) - - for t in ts: - if not t in tables: - tables.append(t) - - return tables - - def make_join_query(self): - tables = self.full_query_tables() - return " NATURAL JOIN ".join(tables) - - - def base_query(self): - dicto = {} - dicto['finalGroups'] = ', '.join(self.query_object['groups']) - if dicto['finalGroups'] != '': - dicto['finalGroups'] = ", " + dicto['finalGroups'] - - dicto['group_query'] = self.make_group_query() - dicto['op'] = ', '.join(self.set_operations()) - dicto['bookid_where'] = self.bookid_query() - dicto['wordid_where'] = self.wordid_query() - dicto['tables'] = self.make_join_query() - logging.info("'{}'".format(dicto['tables'])) - - dicto['catwhere'] = self.make_catwhere("main") - - basic_query = """ - SELECT {op} {finalGroups} - FROM {tables} - WHERE - {bookid_where} - AND - {wordid_where} - AND {catwhere} - {group_query} - """.format(**dicto) - - return basic_query - - def create_catalog_table(self): - # self.catalog = self.prefs['fastcat'] # 'catalog' # Can be replaced with a more complicated query in the event of longer joins. - - """ - - This should check query constraints against a list of tables, and - join to them. So if you query with a limit on LCSH, and LCSH - is listed as being in a separate table, it joins the table - "LCSH" to catalog; and then that table has one column, ALSO - called "LCSH", which is matched against. This allows a bookid - to be a member of multiple catalogs. - - """ - - self.relevantTables = set() - - databaseScheme = self.databaseScheme - - cols = self.needed_columns() - cols = [c for c in cols if not c in ["word", "word1", "word2"]] - - self.relevantTables = self.databaseScheme.tables_for_variables(cols) - - # moreTables = self.tablesNeededForQuery(columns) - - - self.catalog = " NATURAL JOIN ".join(self.relevantTables) - return self.catalog -# for table in self.relevantTables: -# if table!="fastcat" and table!="words" and table!="wordsheap" and table!="master_bookcounts" and table!="master_bigrams" and table != "fastcat_" and table != "wordsheap_": -# self.catalog = self.catalog + """ NATURAL JOIN """ + table + " "# -# -# return self.catalog - - - def make_catwhere(self, query = "sub"): - # Where terms that don't include the words table join. Kept separate so that we can have subqueries only working on one half of the stack. - catlimits = dict() - - for key in list(self.limits.keys()): - # !!Warning--none of these phrases can be used in a bookworm as a custom table names. 
- - if key not in ('word', 'word1', 'word2', 'hasword') and not re.search("words\d", key): - catlimits[key] = self.limits[key] - - if query == "main": - ts = set(self.full_query_tables()) - for key in list(catlimits.keys()): - logging.debug(key) - logging.debug(ts) - if not (key in ts or key + "__id" in ts): - logging.info("removing {}".format(key)) - del catlimits[key] - - if len(list(catlimits.keys())) > 0: - catwhere = where_from_hash(catlimits) - else: - catwhere = "TRUE" - if query == "sub": - self.catwhere = catwhere - return catwhere - - def gram_size(self): - try: - ls = [phrase.split() for phrase in self.limits['word']] - except: - return 0 - lengths = list(set(map(len, ls))) - if len(lengths) > 1: - raise BookwormException('400', 'Must pass all unigrams or all bigrams') - else: - return lengths[0] - - - - def make_wordwheres(self): - self.wordswhere = " TRUE " - - limits = [] - - if self.word_limits: - """ - - This doesn't currently allow mixing of one and two word searches - together in a logical way. It might be possible to just - join on both the tables in MySQL--I'm not completely sure - what would happen. But the philosophy has been to keep - users from doing those searches as far as possible in any - case. - - """ - - - - for phrase in self.limits['word']: - locallimits = dict() - array = phrase.split() - for n, word in enumerate(array): - searchingFor = word - if self.word_field == "stem": - from nltk import PorterStemmer - searchingFor = PorterStemmer().stem_word(searchingFor) - if self.word_field == "case_insensitive" or \ - self.word_field == "Case_Insensitive": - # That's a little joke. Get it? - searchingFor = searchingFor.lower() - - - selectString = "SELECT wordid FROM %s WHERE %s = %%s" % (self.wordsheap, self.word_field) - logging.debug(selectString) - cursor = self.db.cursor - cursor.execute(selectString,(searchingFor,)) - - # Set the search key being used. - search_key = "wordid" - if self.gram_size() > 1: - # 1-indexed entries in the bigram tables. - search_key = "word{}".format(n + 1) - - for row in cursor.fetchall(): - wordid = row[0] - try: - locallimits[search_key] += [wordid] - except KeyError: - locallimits[search_key] = [wordid] - - if len(locallimits) > 0: - limits.append(where_from_hash(locallimits, comp = " = ", escapeStrings=False)) - - - self.wordswhere = "(" + ' OR '.join(limits) + ")" - if limits == []: - # In the case that nothing has been found, tell it explicitly to search for - # a condition when nothing will be found. - self.wordswhere = "bookid = -1" - - wordlimits = dict() - - limitlist = copy.deepcopy(list(self.limits.keys())) - - for key in limitlist: - if re.search("words\d", key): - wordlimits[key] = self.limits[key] - self.max_word_length = max(self.max_word_length, 2) - del self.limits[key] - - if len(list(wordlimits.keys())) > 0: - self.wordswhere = where_from_hash(wordlimits) - - return self.wordswhere - - def build_wordstables(self): - # Deduce the words tables we're joining against. - # The iterating on this can be made more general to get 3 or four grams in pretty easily. - # This relies on a determination already having been made about whether - # this is a unigram or bigram search; that's reflected in the self.selections - # variable. 
- - if self.wordsTables is not None: - return - - needsBigrams = (self.max_word_length == 2 or re.search("words2", self.selections)) - - needsUnigrams = self.max_word_length == 1; - - if self.max_word_length > 2: - err = dict(code=400, message="Phrase is longer than what Bookworm currently supports") - raise BookwormException(err) - - if needsBigrams: - self.main = ''' - master_bigrams as main - ''' - - self.wordstables = """ - JOIN %(wordsheap)s as words1 ON (main.word1 = words1.wordid) - JOIN %(wordsheap)s as words2 ON (main.word2 = words2.wordid) """ % self.__dict__ - - # I use a regex here to do a blanket search for any sort of word limitations. That has some messy sideffects (make sure the 'hasword' - # key has already been eliminated, for example!) but generally works. - - elif needsUnigrams: - self.main = ''' - master_bookcounts as main - ''' - - self.wordstables = """ - JOIN ( %(wordsheap)s as words1) ON (main.wordid = words1.wordid) - """ % self.__dict__ - - else: - """ - Have _no_ words table if no words searched for or grouped by; - instead just use nwords. This - means that we can use the same basic functions both to build the - counts for word searches and - for metadata searches, which is valuable because there is a - metadata-only search built in to every single ratio - query. (To get the denominator values). - - Call this OLAP, if you like. - """ - self.main = " " - self.operation = ','.join(self.set_operations(with_words = False)) - """ - This, above is super important: the operation used is relative to the counttype, and changes to use 'catoperation' instead of 'bookoperation' - That's the place that the denominator queries avoid having to do a table scan on full bookcounts that would take hours, and instead takes - milliseconds. - """ - self.wordstables = " " - self.wordswhere = " TRUE " - # Just a dummy thing to make the SQL writing easier. Shouldn't take any time. Will usually be extended with actual conditions. - - def set_operations(self): - - with_words = self.word_limits - - output = [] - - # experimental - if self.query_object['counttype'] == 'bookid': - return ['bookid'] - - if self.query_object['counttype'] == 'wordid': - return ['wordid'] - - - if with_words: - if "TextCount" in self.query_object['counttype']: - output.append("count(DISTINCT main.bookid) as TextCount") - if "WordCount" in self.query_object['counttype']: - output.append("sum(main.count) as WordCount") - else: - if "WordCount" in self.query_object['counttype']: - output.append("sum(nwords) as WordCount") - if "TextCount" in self.query_object['counttype']: - output.append("count(nwords) as TextCount") - - return output - - def bookid_query(self): - - q = "SELECT bookid FROM {catalog} WHERE {catwhere}""".format(**self.__dict__) - - logging.debug("'{}'".format(self.catwhere)) - - if self.catwhere == "TRUE": - self.bookid_where = " TRUE " - - else: - self.bookid_where = " bookid IN ({}) ".format(q) - - - return self.bookid_where - - def query(self): - - """ - Return the SQL query that fills the API request. - - There must be a search method filled out. 
- """ - - if (self.query_object['method'] == 'schema'): - return "SELECT name,type,description,tablename,dbname,anchor FROM masterVariableTable WHERE status='public'" - elif (self.query_object['method'] == 'search'): - return self.bibliography_query() - elif self.query_object['method'] == 'data': - return self.base_query() - else: - raise BookwormException('400', 'Must enter "schema", "search", or "data" as method') - - - def bibliography_query(self, limit = "100"): - # I'd like to redo this at some point so it could work as an API call more naturally. - self.limit = limit - self.ordertype = "sum(main.count*10000/nwords)" - try: - if self.query_object['ordertype'] == "random": - if self.counttype in [ - "WordCount" - ]: - self.ordertype = "RAND()" - else: - # This is a based on an attempt to match various - # different distributions I found on the web somewhere to give - # weighted results based on the counts. It's not perfect, but might - # be good enough. Actually doing a weighted random search is not easy without - # massive memory usage inside sql. - self.ordertype = "RAND()" - # self.ordertype = "LOG(1-RAND())/sum(main.count)" - except KeyError: - pass - - # If IDF searching is enabled, we could add a term like '*IDF' here to overweight better selecting words - # in the event of a multiple search. - self.idfterm = "" - prep = self.base_query() - -# if self.main == " ": -# self.ordertype = "RAND()" - - dicto = { - 'fastcat': self.fastcat, - 'tables': self.make_join_query(), - 'ordertype': self.ordertype, - 'catwhere': self.make_catwhere("main"), - 'limit': limit - } - - dicto['bookid_where'] = self.bookid_query() - dicto['wordid_where'] = self.wordid_query() - - bibQuery = """ - SELECT searchstring - FROM catalog RIGHT JOIN ( - SELECT {fastcat}.bookid, {ordertype} as ordering - FROM - {tables} - WHERE - {bookid_where} AND {wordid_where} and {catwhere} - GROUP BY bookid ORDER BY {ordertype} DESC LIMIT {limit} - ) as tmp USING (bookid) ORDER BY ordering DESC; - """.format(**dicto) - return bibQuery - - def search_results(self): - # This is an alias that is handled slightly differently in - # APIimplementation (no "RESULTS" bit in front). Once - # that legacy code is cleared out, they can be one and the same. - - return json.loads(self.return_books()) - - def getActualSearchedWords(self): - # - if len(self.wordswhere) > 7: - words = self.query_object['search_limits']['word'] - # Break bigrams into single words. - words = ' '.join(words).split(' ') - q = "SELECT word FROM {} WHERE {}".format(self.wordsheap, where_from_hash({self.word_field:words})) - logging.debug(q) - self.cursor.execute(q) - self.actualWords = [item[0] for item in self.cursor.fetchall()] - else: - raise TypeError("Suspiciously low word count") - self.actualWords = ["tasty", "mistake", "happened", "here"] - - def custom_SearchString_additions(self, returnarray): - """ - It's nice to highlight the words searched for. 
This will be on partner web sites, so requires custom code for different databases - """ - db = self.query_object['database'] - if db in ('jstor', 'presidio', 'ChronAm', 'LOC', 'OL'): - self.getActualSearchedWords() - if db == 'jstor': - joiner = "&searchText=" - preface = "?Search=yes&searchText=" - urlRegEx = "http://www.jstor.org/stable/\d+" - if db == 'presidio' or db == 'OL': - joiner = "+" - preface = "# page/1/mode/2up/search/" - urlRegEx = 'http://archive.org/stream/[^"# ><]*' - if db in ('ChronAm', 'LOC'): - preface = "/;words=" - joiner = "+" - urlRegEx = 'http://chroniclingamerica.loc.gov[^\"><]*/seq-\d+' - newarray = [] - for string in returnarray: - try: - base = re.findall(urlRegEx, string)[0] - newcore = ' search inside ' - string = re.sub("^", "", string) - string = re.sub("$", "", string) - string = string+newcore - except IndexError: - pass - newarray.append(string) - # Arxiv is messier, requiring a whole different URL interface: http://search.arxiv.org:8081/paper.jsp?r=1204.3352&qs=netwokr - else: - newarray = returnarray - return newarray - - def execute(self): - # This performs the query using the method specified in the passed parameters. - if self.method == "Nothing": - pass - else: - value = getattr(self, self.method)() - return value - -class databaseSchema(object): - """ - This class stores information about the database setup that is used to optimize query creation query - and so that queries know what tables to include. - It's broken off like this because it might be usefully wrapped around some of - the backend features, - because it shouldn't be run multiple times in a single query (that spawns two instances of itself), - as was happening before. - - It's closely related to some of the classes around variables and - variableSets in the Bookworm Creation scripts, - but is kept separate for now: that allows a bit more flexibility, - but is probaby a Bad Thing in the long run. - """ - - def __init__(self, db): - self.db = db - self.cursor=db.cursor - # has of what table each variable is in - self.tableToLookIn = {} - - # hash of what the root variable for each search term is (eg, - # 'author_birth' might be crosswalked to 'authorid' in the - # main catalog.) - self.anchorFields = {} - - # aliases: a hash showing internal identifications codes that - # dramatically speed up query time, but which shouldn't be - # exposed. So you can run a search for "state," say, and the - # database will group on a 50-element integer code instead of - # a VARCHAR that has to be long enough to support - # "Massachusetts" and "North Carolina." A couple are - # hard-coded in, but most are derived by looking for fields - # that end in the suffix "__id" later. - - # The aliases starts with a dummy alias for fully grouped queries. 
- self.aliases = {} - self.newStyle(db) - - - def newStyle(self, db): - - self.tableToLookIn['bookid'] = self.fallback_table('fastcat') - self.tableToLookIn['filename'] = self.fallback_table('fastcat') - ff = self.fallback_table('fastcat') - self.anchorFields[ff] = ff - - self.tableToLookIn['wordid'] = self.fallback_table('wordsheap') - self.tableToLookIn['word'] = self.fallback_table('wordsheap') - - ww = self.fallback_table('wordsheap') - self.anchorFields[ww] = ww - - - tablenames = dict() - tableDepends = dict() - q = "SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);" - logging.debug(q) - db.cursor.execute(q) - - for row in db.cursor.fetchall(): - (dbname, alias, tablename, dependsOn) = row - tablename = self.fallback_table(tablename) - dependsOn = self.fallback_table(dependsOn) - - self.tableToLookIn[dbname] = tablename - self.anchorFields[tablename] = dependsOn - - self.aliases[dbname] = alias - - def fallback_table(self,tabname): - """ - Fall back to the saved versions if the memory tables are unpopulated. - - Use a cache first to avoid unnecessary queries, though the overhead shouldn't be much. - """ - tab = tabname - if tab.endswith("_"): - return tab - if tab in ["words","master_bookcounts","master_bigrams","catalog"]: - return tab - - if not hasattr(self,"fallbacks_cache"): - self.fallbacks_cache = {} - - if tabname in self.fallbacks_cache: - return self.fallbacks_cache[tabname] - - q = "SELECT COUNT(*) FROM {}".format(tab) - logging.debug(q) - try: - self.db.cursor.execute(q) - length = self.db.cursor.fetchall()[0][0] - if length==0: - tab += "_" - except MySQLdb.ProgrammingError: - tab += "_" - - self.fallbacks_cache[tabname] = tab - - return tab - - def tables_for_variables(self, variables, tables = []): - tables = [] - - for variable in variables: - lookup_table = self.tableToLookIn[variable] - if lookup_table in tables: - continue - tables.append(lookup_table) - while True: - anchor = self.fallback_table(self.anchorFields[lookup_table]) - if anchor in tables: - break - else: - tables.append(anchor) - lookup_table = anchor - - return tables - - - -def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_joiner = " OR "): - whereterm = [] - # The general idea here is that we try to break everything in search_limits down to a list, and then create a whereterm on that joined by whatever the 'joiner' is ("AND" or "OR"), with the comparison as whatever comp is ("=",">=",etc.). - # For more complicated bits, it gets all recursive until the bits are all in terms of list. - if joiner is None: - joiner = " AND " - for key in list(myhash.keys()): - values = myhash[key] - if isinstance(values, (str, bytes)) or isinstance(values, int) or isinstance(values, float): - # This is just human-being handling. You can pass a single value instead of a list if you like, and it will just convert it - # to a list for you. - values = [values] - # Or queries are special, since the default is "AND". This toggles that around for a subportion. - - if key == "$or" or key == "$OR": - local_set = [] - for comparison in values: - local_set.append(where_from_hash(comparison, comp=comp)) - whereterm.append(" ( " + " OR ".join(local_set) + " )") - elif key == '$and' or key == "$AND": - for comparison in values: - whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp)) - elif isinstance(values, dict): - if joiner is None: - joiner = " AND " - # Certain function operators can use MySQL terms. 
- # These are the only cases that a dict can be passed as a limitations - operations = {"$gt":">", "$ne":"!=", "$lt":"<", - "$grep":" REGEXP ", "$gte":">=", - "$lte":"<=", "$eq":"="} - - for operation in list(values.keys()): - if operation == "$ne": - # If you pass a lot of ne values, they must *all* be false. - subjoiner = " AND " - else: - subjoiner = " OR " - whereterm.append(where_from_hash({key:values[operation]}, comp=operations[operation], list_joiner=subjoiner)) - elif isinstance(values, list): - # and this is where the magic actually happens: - # the cases where the key is a string, and the target is a list. - if isinstance(values[0], dict): - # If it's a list of dicts, then there's one thing that happens. - # Currently all types are assumed to be the same: - # you couldn't pass in, say {"year":[{"$gte":1900}, 1898]} to - # catch post-1898 years except for 1899. Not that you - # should need to. - for entry in values: - whereterm.append(where_from_hash(entry)) - else: - # Note that about a third of the code is spent on escaping strings. - if escapeStrings: - if isinstance(values[0], (str, bytes)): - quotesep = "'" - else: - quotesep = "" - - def escape(value): - # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. - return str(MySQLdb.escape_string(to_unicode(value)), 'utf-8') - else: - def escape(value): - return to_unicode(value) - quotesep = "" - - joined = list_joiner.join([" ({}{}{}{}{}) ".format(key, comp, quotesep, escape(value), quotesep) for value in values]) - whereterm.append(" ( {} ) ".format(joined)) - - if len(whereterm) > 1: - return "(" + joiner.join(whereterm) + ")" - else: - return whereterm[0] - # This works pretty well, except that it requires very specific sorts of terms going in, I think. diff --git a/bookwormDB/multiprocessingHelp.py b/bookwormDB/multiprocessingHelp.py index f1b02bc..c0d0cc7 100644 --- a/bookwormDB/multiprocessingHelp.py +++ b/bookwormDB/multiprocessingHelp.py @@ -1,6 +1,7 @@ import os import psutil import logging +logger = logging.getLogger("bookworm") def mp_stats(): try: @@ -13,7 +14,7 @@ def mp_stats(): memory = int(psutil.virtual_memory()[4]) if memory < 1024: - logging.warning("Not much memory to work with--vocab may be inexact") + logger.warning("Not much memory to work with--vocab may be inexact") return (cpus, memory) diff --git a/bookwormDB/query_cache.py b/bookwormDB/query_cache.py new file mode 100644 index 0000000..cc8367e --- /dev/null +++ b/bookwormDB/query_cache.py @@ -0,0 +1,85 @@ +import pyarrow as pa +from pyarrow import feather +import pandas as pd +from pathlib import Path + +import logging +logger = logging.getLogger("bookworm") + +import json +import hashlib +import random + +def hashcode(query: dict) -> str: + return hashlib.sha1(json.dumps(query).encode("utf-8")).hexdigest() + +class Query_Cache: + # By default, use locally stored feather files. If that's bad, it Would + # be pretty easy to split the class out into anything using an API + # that maps from cache[query_dictionary] -> pandas_frame. + + def __init__(self, location, + max_entries = 256, + max_length = 2**8, + cold_storage = None): + """ + location: where to keep some cached queries as parquet. + max_entries: the max size of the cache. + max_length: row length above which a query is never cached. + cold_storage: Optional location of a second, read-only cache. + Feather files in this can be nested at any depth. 
+ """ + self.location = location + self.max_entries = max_entries + self.max_length = max_length + self.precache = {} + + if not Path(location).exists(): + Path(location).mkdir(parents = True) + assert Path(location).is_dir() + if cold_storage is not None: + for path in Path(cold_storage).glob("**/*.feather"): + code = str(path.with_suffix("").name) + self.precache[code] = path + + def filepath(self, query: dict) -> Path: + code = hashcode(query) + if code in self.precache: + return self.precache[code] + return (Path(self.location) / code).with_suffix(".feather") + + def __getitem__(self, query: dict) -> pd.DataFrame: + if hashcode(query) in self.precache: + # First check any manual queries. +# print(self.precache[hashcode(query)]) + return feather.read_feather(self.precache[hashcode(query)]) + + p = self.filepath(query) + table = feather.read_feather(p) + p.touch() # Note access for LRU cache flushing. + return table + + def __setitem__(self, query: dict, table: pd.DataFrame): + if table.shape[0] > self.max_length: + return + if not self.max_length: + # 0 or None are both reasonable here. + return + path = self.filepath(query).open(mode="wb") + feather.write_feather(table, path, compression = "zstd") + + def trim_cache(self): + """ + Remove all cached feather files except the first + few (defined by the max_entries parameter of the class.) + """ + files = Path(self.location).glob("*.feather") + all_of_em = [] + for file in files: + all_of_em.append((-1 * file.stat().st_mtime, file)) + all_of_em.sort() + for extra in all_of_em[self.max_entries:]: + try: + extra[1].unlink() + except: + logger.error(f"Unable to unlink file {extra}; assuming another thread got it first, although that's pretty unlikely!") diff --git a/bookwormDB/schema_primitives.py b/bookwormDB/schema_primitives.py index ad62cc4..13fe934 100644 --- a/bookwormDB/schema_primitives.py +++ b/bookwormDB/schema_primitives.py @@ -9,7 +9,7 @@ "$id": "#/properties/counttype/items", "type": "string", "default": "WordCount", - "enum": agg_keys + "enum": agg_keys, "pattern": "^(.*)$" } } diff --git a/bookwormDB/search_limits.py b/bookwormDB/search_limits.py index 75cb145..0dd5ae6 100644 --- a/bookwormDB/search_limits.py +++ b/bookwormDB/search_limits.py @@ -1,4 +1,3 @@ -import MySQLdb def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_joiner = " OR "): whereterm = [] @@ -14,22 +13,24 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ values = [values] # Or queries are special, since the default is "AND". This toggles that around for a subportion. - if key == "$or" or key == "$OR": + if key == "$or": local_set = [] for comparison in values: local_set.append(where_from_hash(comparison, comp=comp)) whereterm.append(" ( " + " OR ".join(local_set) + " )") - elif key == '$and' or key == "$AND": + elif key == '$and': for comparison in values: whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp)) elif isinstance(values, dict): if joiner is None: joiner = " AND " - # Certain function operators can use MySQL terms. + # Certain function operators can use Mongo terms. 
# These are the only cases that a dict can be passed as a limitations - operations = {"$gt":">", "$ne":"!=", "$lt":"<", - "$grep":" REGEXP ", "$gte":">=", - "$lte":"<=", "$eq":"="} + operations = { + "$gt" : ">", "$ne" : "!=", "$lt" : "<", + "$grep" : " REGEXP ", "$gte" : ">=", + "$lte" : "<=", "$eq" : "=" + } for operation in list(values.keys()): if operation == "$ne": @@ -59,10 +60,10 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ def escape(value): # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. - return str(MySQLdb.escape_string(to_unicode(value)), 'utf-8') + return value else: def escape(value): - return to_unicode(value) + return value quotesep = "" joined = list_joiner.join([" ({}{}{}{}{}) ".format(key, comp, quotesep, escape(value), quotesep) for value in values]) diff --git a/bookwormDB/sqliteKV.py b/bookwormDB/sqliteKV.py deleted file mode 100644 index c3dcb6d..0000000 --- a/bookwormDB/sqliteKV.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright © 2018 Sylvain PULICANI -# Super heavily changed by Ben Schmidt; the old version was a true -# kv store, this one just autoincrements a lookup table. - -# This should generally be thread safe for reads, but not for writes. -# If multip - -# This work is free. You can redistribute it and/or modify it under the -# terms of the Do What The Fuck You Want To Public License, Version 2, -# as published by Sam Hocevar. See the COPYING file for more details. - -# sqlite_kv.py -# -# Python implementation of the SQLiteKV store. - -import sqlite3 - - -class KV: - """ - Python implementation of the SQLiteKV store, with additionnal methods - to make it more pythonic. - ..Warning:: - * The `close` method has to be called after use. - * The `delete` method is not yet implemented. - """ - def __init__(self, dbfile): - """ - Open a connection to the SQLite file. If it doesn't exists, create it - and add the needed tables. - """ - self.conn = None - self.conn = sqlite3.connect(dbfile, detect_types=sqlite3.PARSE_DECLTYPES) - self.conn.row_factory = sqlite3.Row - - tables = [dict(r)['name'] for r in self.conn.execute( - "SELECT name FROM sqlite_master WHERE type='table'")] - - if 'keys' not in tables: - self.conn.execute("""CREATE TABLE keys( - ID INTEGER PRIMARY KEY ASC, - key TEXT UNIQUE NOT NULL)""") - - self.conn.execute("CREATE UNIQUE INDEX idx_keys ON keys(key)") - - - def close(self): - """ - Properly close the database. - """ - self.conn.commit() - self.conn.close() - - def __getitem__(self, key): - rows = self.conn.execute("""SELECT ID FROM keys - WHERE keys.key=(?)""", (key, )) - row = rows.fetchone() - if row is None: - raise KeyError(key) - return row['ID'] - - def register(self, key): - self.conn.execute("INSERT INTO keys(key) VALUES (?)", - (key, )) - diff --git a/bookwormDB/store.py b/bookwormDB/store.py new file mode 100644 index 0000000..7aadd8a --- /dev/null +++ b/bookwormDB/store.py @@ -0,0 +1,26 @@ +# Just a place to store per-process configurations rather than pass through a +# nest of functions. Bad idea? Probably--I got it from too much Javascript. +# Only one location should ever have write +# access, certainly. But this should be easier to disentangle than endless passed 'args.whatever' + +import yaml +from pathlib import Path + +store_dict = { + 'duckdb_directory': Path(".") +} + +directories = [Path("."), Path("/var/lib/bookworm/"), *Path(".").parents, Path("~").expanduser()] +directories.reverse() # because we want the immediate parent first. 
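Both `default_db_directory()` in the CLI changes above and this `store.py` scan look for a small YAML file (`.bookworm.yaml`, `.bookworm.yml`, or `bookworm.yaml`). A minimal sketch of what such a file might contain and how it is read; the directory value is invented.

```
import yaml

# Hypothetical contents of a user-level ~/.bookworm.yml; the path is invented.
example_config = "db_directory: /srv/bookworm/databases\n"

config = yaml.safe_load(example_config)
print(config["db_directory"])          # -> /srv/bookworm/databases

# Written to disk, the same content would be picked up by the directory scan
# here and by default_db_directory() in cli.py, e.g.:
# pathlib.Path.home().joinpath(".bookworm.yml").write_text(example_config)
```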
+ +for dir in directories: + for file in [".bookworm.yaml", ".bookworm.yml", "bookworm.yaml"]: + p = dir / file + if p.exists(): + print("Loading", dir) + store_dict = yaml.safe_load(p.open()) + +def store(): + global store_dict + return store_dict + diff --git a/bookwormDB/tokenizer.py b/bookwormDB/tokenizer.py deleted file mode 100644 index b043550..0000000 --- a/bookwormDB/tokenizer.py +++ /dev/null @@ -1,327 +0,0 @@ -#! /usr/bin/python - -from __future__ import print_function -import random -import sys -import os -from .sqliteKV import KV -import time -import logging -import numpy as np -from pandas import read_csv -from io import StringIO - -""" -This section does a lot of work on tokenizing and aggregating wordcounts. -""" - -# import regex as re --now done only when the function is actually called. -# Set at a global to avoid multiple imports. - -re = None - -# Likewise, store a thread-wise count on whether we've thrown a unicode encoding error. -haveWarnedUnicode = False -# And the default regex is generated by a function on demand. -bigregex = None - - -def wordRegex(): - """ - #I'm including the code to create the regex, which makes it more readable. - Note that this uses *unicode*: among other things, that means that it needs to be passed - a unicode-decoded string: and that we have to use the "regex" module instead of the "re" module. Python3 will make this, perhaps, easier. - """ - global re - if re is None: - import regex as re - MasterExpression = r"\w+" - possessive = MasterExpression + r"'s" - numbers = r"(?:[\$])?\d+" - decimals = numbers + r"\.\d+" - abbreviation = r"(?:mr|ms|mrs|dr|prof|rev|rep|sen|st|sr|jr|ft|gen|adm|lt|col|etc)\." - sharps = r"[a-gjxA-GJX]#" - punctuators = r"[^\w\p{Z}]" - """ - Note: this compiles looking for the most complicated words first, and as it goes on finds simpler and simpler forms - """ - bigregex = re.compile("|".join([decimals,possessive,numbers,abbreviation,sharps,punctuators,MasterExpression]),re.UNICODE|re.IGNORECASE) - return bigregex - - -def readDictionaryFile(prefix=""): - look = dict() - for line in open(prefix + ".bookworm/texts/wordlist/wordlist.txt"): - line = line.rstrip("\n") - v, k, _ = line.split("\t") - look[k] = v - return look - -def readIDfile(prefix=""): - if not os.path.exists(".bookworm/metadata/textids.sqlite"): - raise FileNotFoundError("No textids DB: run `bookworm build textids`") - return KV(prefix + ".bookworm/metadata/textids.sqlite") - -class tokenBatches(object): - """ - A tokenBatches is a manager for tokenizers. Each one corresponds to - a reasonable number of texts to read in to memory on a single processor: - during the initial loads, there will probably be one per core. - It doesn't store the original text, just the unigram and bigram tokenizations in its attached self.counts arrays. - - It writes out its dat to a single file: - in this way, a batch of up to several hundred thousand individual files is grouped into a single file. - - It also has a method that encodes and writes its wordcounts into a tsv file appropriate for reading with mysql, - with 3-byte integer encoding for wordid and bookid. - """ - - def __init__(self, levels=["unigrams","bigrams"]): - """ - - mode: 'encode' (write files out) - """ - self.id = '%030x' % random.randrange(16**30) - self.levels=levels - - # placeholder to alert that createOutputFiles must be run. 
- self.completedFile = None - - def createOutputFiles(self): - self.completedFile = open(".bookworm/texts/encoded/completed/" + self.id,"w") - self.outputFiles = dict() - for level in self.levels: - self.outputFiles[level] = open(".bookworm/texts/encoded/{}/{}.txt".format(level, self.id),"w") - - def attachDictionaryAndID(self): - self.dictionary = readDictionaryFile() - self.IDfile = readIDfile() - - - def close(self): - """ - This test allows the creation of bookworms with fewer document than requested - threads, which happens to be the case in the tests. - """ - if self.completedFile is not None: - self.completedFile.close() - for v in self.outputFiles.values(): - v.close() - - def encodeRow(self, - filename, - tokenizer, - write_completed=True - ): - """ - 'id': the filename - 'tokenizer': a tokenizer object - - """ - if self.completedFile is None: - self.createOutputFiles() - self.attachDictionaryAndID() - - #The dictionary and ID lookup tables should be pre-attached. - dictionary = self.dictionary - IDfile = self.IDfile - - levels = None - """ - if source=="raw_text": - parts = row.split("\t", 1) - filename = parts[0] - try: - tokens = tokenizer(parts[1]) - except IndexError: - logging.warn("\nFound no tab in the input for '" + filename + "'...skipping row\n") - levels = self.levels - - if source == "countfile": - try: - (filename, token, count) = row.split("\t") - except: - logging.error("Can't find tab\n***************") - logging.error(row) - raise - tokens = preTokenized(token, count, self.levels[0]) - """ - - try: - textid = IDfile[filename] - except KeyError: - logging.warn("Warning: file " + filename + " not found in jsoncatalog.txt, not encoding") - return - - for level in self.levels: - outputFile = self.outputFiles[level] - output = [] - - counts = tokenizer.counts(level) - - for wordset, count in counts.items(): - skip = False - wordList = [] - for word in wordset: - try: - wordList.append(dictionary[word]) - except KeyError: - """ - if any of the words to be included is not in the dictionary, - we don't include the whole n-gram in the counts. - """ - skip = True - if not skip: - wordids = "\t".join(wordList) - output.append("{}\t{}\t{}".format(int(textid), wordids, count)) - - try: - if len(output) > 0: - # The test is necessary because otherwise this prints a blank line. - outputFile.write("\n".join(output) + "\n") - - except IOError as e: - logging.exception(e) - - if write_completed: - self.completedFile.write(filename + "\n") - -class Tokenizer(object): - """ - A tokenizer is initialized with a single text string. - - It assumes that you have in namespace an object called "bigregex" which - identifies words. - - (I'd define it here, but it's a performance optimization to avoid compiling the large regex millions of times.) - - the general way to call it is to initialize, and then for each desired set of counts call "tokenizer.counts("bigrams")" (or whatever). - - That returns a dictionary, whose keys are tuples of length 1 for unigrams, 2 for bigrams, etc., and whose values are counts for that ngram. The tuple form should allow faster parsing down the road. - - """ - - def __init__(self, string, tokenization_regex=None): - global haveWarnedUnicode - self.string = string - self.tokenization_regex = tokenization_regex - self.tokens = None - def tokenize(self): - """ - This tries to return the pre-made tokenization: - if that doesn't exist, it creates it. - """ - if self.tokens is not None: - return self.tokens - """ - For speed, don't import until here. 
- """ - tokenization_regex=self.tokenization_regex - global re - if re is None: - import regex as re - if tokenization_regex is None: - # by default, use the big regex. - global bigregex - if bigregex==None: - bigregex = wordRegex() - tokenization_regex = bigregex - self.tokens = re.findall(tokenization_regex, self.string) - return self.tokens - - def ngrams(self, n, collapse = False): - """ - All the ngrams in the text can be created as a tuple by zipping an arbitrary number of - copies of the text to itself. - """ - - self.tokenize() - l = list(zip(*[self.tokens[i:] for i in range(n)])) - if collapse: - l = [" ".join(tupled) for tupled in l] - return l - - def unigrams(self): - return self.ngrams(1) - - def bigrams(self): - return self.ngrams(2) - - def trigrams(self): - return self.ngrams(3) - - def allgrams(self, max = 6): - output = [] - for i in range(1, max + 1): - output.extend(self.ngrams(i, collapse = True)) - return output - - def words(self): - """ - 1-grams have tuple keys, but words have index keys. - """ - self.tokenize() - return self.tokens - - def counts(self, whichType): - - count = dict() - for gram in getattr(self,whichType)(): - try: - count[gram] += 1 - except KeyError: - count[gram] = 1 - return count - - -class PreTokenized(object): - """ - This class is a little goofy: it mimics the behavior of a tokenizer - one data that's already been tokenized by something like - Google Ngrams or JStor Data for Research. - """ - - def __init__(self, csv_string, level): - f = read_csv(StringIO(csv_string), - lineterminator = "\f", - # Ugh--want 'NA' to be a word. - dtype = {'word': str, 'counts': np.int}, - keep_default_na=False, - names = ["word", "counts"]) - self.level = level - if level == 'words': - self.output = dict(zip(f.word, f.counts)) - else: - self.output = dict(zip([tuple(w.split(" ")) for w in f.word], f.counts)) - - def counts(self,level): - if level != self.level: - raise - return self.output - - -def getAlreadySeenList(folder): - #Load in a list of what's already been translated for that level. - #Returns a set. - files = os.listdir(folder) - seen = set([]) - for file in files: - for line in open(folder + "/" + file): - seen.add(line.rstrip("\n")) - return seen - -def encode_text_stream(): - seen = getAlreadySeenList(".bookworm/texts/encoded/completed") - tokenBatch = tokenBatches() - tokenBatch.attachDictionaryAndID() - for line in sys.stdin: - filename = line.split("\t",1)[0] - line = line.rstrip("\n") - if filename not in seen: - tokenBatch.encodeRow(line) - - # And printout again at the end - -if __name__=="__main__": - encode_text_stream() - diff --git a/bookwormDB/variableSet.py b/bookwormDB/variableSet.py deleted file mode 100644 index 024704f..0000000 --- a/bookwormDB/variableSet.py +++ /dev/null @@ -1,903 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import json -import os -import decimal -import re -from MySQLdb import escape_string -import logging -import subprocess -from .sqliteKV import KV - -def to_unicode(obj): - if isinstance(obj, bytes): - obj = str(obj) - if isinstance(obj, int) or isinstance(obj, float) or isinstance(obj, decimal.Decimal): - obj = str(obj) - return obj - -def splitMySQLcode(string): - - """ - MySQL code can only be executed one command at a time, and fails if it has any empty slots - So as a convenience wrapper, I'm just splitting it and returning an array. 
- """ - logging.debug(f"Splitting: '{string}'") - try: - output = ['%s;\n' % query for query in string.split(';') if re.search(r"\w", query)] - output = [o.strip("\\n") for o in output if o != "\\n"] - except AttributeError: - # Occurs when the field is completely empty - output = [] - return output - - -def guessBasedOnNameAndContents(metadataname,dictionary): - """ - This makes a guess based on the data field's name and type. - CUrrently it assumes everything is categorical; that can really chunk out on some text fields, but works much better for importing csvs. Probably we want to take some other things into account as well. - """ - description = {"field":metadataname,"datatype":"categorical","type":"character","unique":True} - - example = list(dictionary.keys())[0] - - if type(example) == int: - description["type"] = "integer" - - if type(example) == list: - description["unique"] = False - - if metadataname == "searchstring": - return {"datatype": "searchstring", "field": "searchstring", "unique": True, "type": "text"} - - if re.search("date",metadataname) or re.search("time",metadataname): - description["datatype"] = "time" - - values = [dictionary[key] for key in dictionary] - averageNumberOfEntries = sum(values)/ len(values) - - if averageNumberOfEntries > 2: - description["datatype"] = "categorical" - - return description - - -class dataField(object): - """ - This define a class that supports a data field from a json definition. - We'll use this to spit out appropriate sql code and JSON where needed. - The 'definition' here means the user-generated array (submitted in json but - parsed out before this) described in the Bookworm interface. - This knows whether it's unique, whether it should treat itself as a date, etc. - - The complicated bits are about allowing fast lookups for arbitrary-length - character lookups: for a variable like "country," it will also create - the new field "country__id" and the table "countryLookup" to allow - faster joins on the main database - """ - - def __init__(self, definition, dbToPutIn, anchorType="MEDIUMINT UNSIGNED", anchor="bookid",table="catalog",fasttab="fastcat"): - #anchorType should be derived from somewhere. - self.anchorType = anchorType - self.anchor = anchor - - for key in definition.keys(): - vars(self)[key] = definition[key] - self.dbToPutIn = dbToPutIn - - #ordinarily, a column has no alias other than itself. - self.alias = self.field - self.status = "hidden" - - #The table it's stored in will be either 'catalog', or a new - #table named after the variable. For now, at least. (later the anchor should get used). - - self.fastField = self.field - self.finalTable = fasttab - if self.datatype == "categorical": - self.type = "character" - #This will catch a common sort of mistake (calling it text), - #but also coerce any categorical data to have fewer than 255 characters. - #This is worth it b/c a more than 255-character field will take *forever* to build. - self.fastField = "%s__id" % self.field - self.alias = self.fastField - #If it's a categorical variable, it will be found in a lookup table. 
- self.finalTable = self.field + "Lookup" - self.status = "public" - - if self.datatype == "time": - self.status = "public" - - if self.unique: - self.table = table - self.fasttab = fasttab - - else: - self.table = self.field + "Disk" - self.fasttab = self.field + "heap" - self.outputloc = ".bookworm/metadata/%s.txt" % self.field - - - def __repr__(self): - val = "Data Field '{}'".format(self.field) - val += "\n\tdatatype: {}".format(self.datatype) - val += "\n\ttype: {}".format(self.type) - val += "\n\tuniqueness: {}".format(self.unique) - return val - - def slowSQL(self, withIndex=False): - """ - This returns something like "author VARCHAR(255)", - a small definition string with an index, potentially. - """ - - mysqltypes = { - "character": "VARCHAR(255)", - "integer": "INT", - "text": "VARCHAR(5000)", - "decimal": "DECIMAL (9,4)", - "float": "FLOAT" - } - - # Indexing both the field and against the anchor for fast memory table creation. - indexstring = ", INDEX (%(field)s), INDEX (%(anchor)s, %(field)s " % self.__dict__ - #need to specify fixed prefix length on text strings: (http://dev.mysql.com/doc/refman/5.0/en/create-index.html) - # If it's a text field, we need to curtail the index at 255 characters - # or else indexes start timing out or eating up all the memory. - indextypes = { - "character": "%s)" % indexstring, - "integer": "%s)" % indexstring, - "text": "%s (255) )" % indexstring, - "decimal": "%s)" % indexstring - } - createstring = " %s %s" % (self.field, mysqltypes[self.type]) - - if withIndex and self.type != 'text' and self.type != "float": - return '%s%s' % (createstring, indextypes[self.type]) - - return createstring - - def fastSQL(self): - """ - This creates code to go in a memory table: it assumes that the disk - tables are already there, and that a connection cursor is active. - Memory tables in MySQL don't suppor the VARCHAR (they just take up all - 255 characters or whatever); thus, it has to be stored this other way. - """ - if self.datatype != 'etc': - if self.type == "character": - self.setIntType() - return " %(field)s__id %(intType)s" % self.__dict__ - if self.type == "integer": - return " %s INT" % self.field - if self.type == "decimal": - return " %s DECIMAL (9,4) " % self.field - if self.type == "float": - return " %s FLOAT " % self.field - else: - return None - else: - return None - - def buildDiskTable(self,fileLocation="default"): - """ - Builds a disk table for a nonunique variable. 
- """ - db = self.dbToPutIn - dfield = self - - if fileLocation == "default": - fileLocation = ".bookworm/metadata/" + dfield.field + ".txt" - - logging.info("Making a SQL table to hold the data for " + dfield.field) - - q1 = """DROP TABLE IF EXISTS """ + dfield.field + "Disk" - db.query(q1) - db.query("""CREATE TABLE IF NOT EXISTS """ + dfield.field + """Disk ( - """ + self.anchor + " " + self.anchorType + """, - """ + dfield.slowSQL(withIndex=True) + """ - );""") - db.query("ALTER TABLE " + dfield.field + "Disk DISABLE KEYS;") - loadcode = """LOAD DATA LOCAL INFILE '""" + fileLocation + """' - INTO TABLE """ + dfield.field + """Disk - FIELDS ESCAPED BY '';""" - db.query(loadcode) - # cursor = db.query("""SELECT count(*) FROM """ + dfield.field + """Disk""") - db.query("ALTER TABLE " + dfield.field + "Disk ENABLE KEYS") - - def build_ID_and_lookup_tables(self): - IDcode = self.buildIdTable() - for query in splitMySQLcode(IDcode): - self.dbToPutIn.query(query) - for query in splitMySQLcode(self.fastLookupTableIfNecessary("MYISAM")): - self.dbToPutIn.query(query) - for query in splitMySQLcode(self.fastSQLTable("MYISAM")): - self.dbToPutIn.query(query) - - def fastLookupTableIfNecessary(self, engine="MEMORY"): - - """ - This uses the already-created ID table to create a memory lookup. - """ - self.engine = engine - if self.datatype == 'categorical': - logging.debug("Creating a memory lookup table for " + self.field) - self.setIntType() - self.maxlength = self.dbToPutIn.query("SELECT MAX(CHAR_LENGTH(%(field)s)) FROM %(field)s__id" % self.__dict__) - self.maxlength = self.maxlength.fetchall()[0][0] - self.maxlength = max([self.maxlength,1]) - code = """DROP TABLE IF EXISTS tmp; - CREATE TABLE tmp (%(field)s__id %(intType)s ,PRIMARY KEY (%(field)s__id), - %(field)s VARCHAR (%(maxlength)s) ) ENGINE=%(engine)s - SELECT %(field)s__id,%(field)s FROM %(field)s__id;""" % self.__dict__ - tname = self.field+"Lookup" - if engine=="MYISAM": - tname += "_" - - code += "DROP TABLE IF EXISTS {}; RENAME TABLE tmp to {}".format(tname,tname) - return code - return "" - - def fastSQLTable(self,engine="MEMORY"): - #setting engine to another value will create these tables on disk. - queries = "" - self.engine = engine - tname = self.field + "heap" - if engine=="MYISAM": - tname += "_" - if self.unique and self.anchor=="bookid": - pass #when it has to be part of a larger set - if not self.unique and self.datatype == 'categorical': - self.setIntType() - queries += """DROP TABLE IF EXISTS tmp;""" - queries += """CREATE TABLE tmp (%(anchor)s %(anchorType)s , INDEX (%(anchor)s),%(field)s__id %(intType)s ) ENGINE=%(engine)s; """ % self.__dict__ - if engine=="MYISAM": - queries += "INSERT INTO tmp SELECT %(anchor)s ,%(field)s__id FROM %(field)s__id JOIN %(field)sDisk USING (%(field)s); " % self.__dict__ - elif engine=="MEMORY": - queries += "INSERT INTO tmp SELECT * FROM {}_; ".format(tname) - queries += "DROP TABLE IF EXISTS {}; RENAME TABLE tmp TO {}; ".format(tname,tname) - - if self.datatype == 'categorical' and self.unique: - pass - - return queries - - def jsonDict(self): - """ - DEPRECATED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - #This builds a JSON dictionary that can be loaded into outside - bookworm in the "options.json" file. - It's a bad design decision; newer version - just load this directly from the database. - """ - mydict = dict() - #It gets confusingly named: "type" is the key for real name ("time", "categorical" in the json), but also the mysql key ('character','integer') here. 
That would require renaming code in a couple places. - mydict['type'] = self.datatype - mydict['dbfield'] = self.field - try: - mydict['name'] = self.name - except: - mydict['name'] = self.field - if self.datatype == "etc" or self.type == "text": - return dict() #(Some things don't go into the fast settings because they'd take too long) - if self.datatype == "time": - mydict['unit'] = self.field - #default to the full min and max date ranges - #times may not be zero or negative - cursor = self.dbToPutIn.query("SELECT MIN(" + self.field + "), MAX(" + self.field + ") FROM catalog WHERE " + self.field + " > 0 ") - results = cursor.fetchall()[0] - mydict['range'] = [results[0], results[1]] - mydict['initial'] = [results[0], results[1]] - - if self.datatype == "categorical": - mydict['dbfield'] = self.field + "__id" - #Find all the variables used more than 20 times from the database, and build them into something json-usable. - cursor = self.dbToPutIn.query("SELECT %(field)s, %(field)s__id FROM %(field)s__id WHERE %(field)s__count > 20 ORDER BY %(field)s__id ASC LIMIT 500;" % self.__dict__) - sort_order = [] - descriptions = dict() - for row in cursor.fetchall(): - code = row[1] - name = row[0] - code = to_unicode(code) - sort_order.append(code) - descriptions[code] = dict() - """ - These three things all have slightly different meanings: - the english name, the database code for that name, and the short display name to show. - It would be worth allowing lookup files for these: for now, they are what they are and can be further improved by hand. - """ - descriptions[code]["dbcode"] = code - descriptions[code]["name"] = name - descriptions[code]["shortname"] = name - mydict["categorical"] = {"descriptions": descriptions, "sort_order": sort_order} - - return mydict - - def setIntType(self): - try: - alreadyExists = self.intType - except AttributeError: - cursor = self.dbToPutIn.query("SELECT count(DISTINCT "+ self.field + ") FROM " + self.table) - self.nCategories = cursor.fetchall()[0][0] - self.intType = "INT UNSIGNED" - if self.nCategories <= 16777215: - self.intType = "MEDIUMINT UNSIGNED" - if self.nCategories <= 65535: - self.intType = "SMALLINT UNSIGNED" - if self.nCategories <= 255: - self.intType = "TINYINT UNSIGNED" - - def buildIdTable(self, minimum_occurrence_rate = 1/100000): - - """ - This builds an integer crosswalk ID table with a field that stores categorical - information in the fewest number of bytes. This is important because it can take - significant amounts of time to group across categories if they are large: - for example, with 4 million newspaper articles, on one server a GROUP BY with - a 12-byte VARCHAR field takes 5.5 seconds, but a GROUP BY with a 3-byte MEDIUMINT - field corresponding exactly to that takes 2.2 seconds on the exact same data. - That sort of query is included in every single bookworm - search multiple times, so it's necessary to optimize. - Plus, it means we can save space on memory storage - in important ways as well. - """ - #First, figure out how long the ID table has to be and make that into a datatype. - #Joins and groups are slower the larger the field grouping on, so this is worth optimizing. - self.setIntType() - - returnt = "DROP TABLE IF EXISTS tmp;\n\n" - - returnt += "CREATE TABLE tmp ENGINE=MYISAM SELECT %(field)s,count(*) as count FROM %(table)s GROUP BY %(field)s;\n\n" % self.__dict__ - - # XXXX to fix - # Hardcoding this for now at one per 100K in the method definition. Could be user-set. 
- n_documents = self.dbToPutIn.query("SELECT COUNT(*) FROM catalog").fetchall()[0][0] - self.minimum_count = round(n_documents*minimum_occurrence_rate) - # XXXX - - returnt +="DELETE FROM tmp WHERE count < %(minimum_count)s;" % self.__dict__ - - returnt += "DROP TABLE IF EXISTS %(field)s__id;\n\n" % self.__dict__ - - returnt += """CREATE TABLE IF NOT EXISTS %(field)s__id ( - %(field)s__id %(intType)s PRIMARY KEY AUTO_INCREMENT, - %(field)s VARCHAR (255), INDEX (%(field)s, %(field)s__id), %(field)s__count MEDIUMINT UNSIGNED);\n\n""" % self.__dict__ - - returnt += """INSERT INTO %(field)s__id (%(field)s,%(field)s__count) - SELECT %(field)s,count FROM tmp LEFT JOIN %(field)s__id USING (%(field)s) WHERE %(field)s__id.%(field)s__id IS NULL - ORDER BY count DESC;\n\n""" % self.__dict__ - - returnt += """DROP TABLE tmp;\n\n""" - - self.idCode = "%s__id" % self.field - return returnt - - def clear_associated_memory_tables(self): - """ - Remove all data from memory tables associated with this variable. - Useful when refreshing the database. - """ - db = self.dbToPutIn - def exists(tablename): - return len(db.query("SHOW TABLES LIKE '" + tablename + "'").fetchall())>0 - if exists(self.fasttab): - logging.debug("DELETING FROM " + self.fasttab) - self.dbToPutIn.query("DELETE FROM " + self.fasttab) - if not self.unique: - if exists(self.field+"heap"): - self.dbToPutIn.query("DELETE FROM " + self.field + "heap") - if self.datatype=="categorical": - if exists(self.field+"Lookup"): - self.dbToPutIn.query("DELETE FROM " + self.field+"Lookup") - - def updateVariableDescriptionTable(self): - self.memoryCode = self.fastLookupTableIfNecessary() - code = """DELETE FROM masterVariableTable WHERE dbname="%(field)s"; - INSERT INTO masterVariableTable - (dbname, name, type, tablename, anchor, alias, status,description) - VALUES - ('%(field)s','%(field)s','%(type)s','%(finalTable)s','%(anchor)s','%(alias)s','%(status)s','') """ % self.__dict__ - self.dbToPutIn.query(code) - if not self.unique: - code = self.fastSQLTable() - try: - parentTab = self.dbToPutIn.query(""" - SELECT tablename FROM masterVariableTable - WHERE dbname='%s'""" % self.fastAnchor).fetchall()[0][0] - except: - parentTab="fastcat" - self.dbToPutIn.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="%s";' % (self.field + "heap")) - q = "INSERT INTO masterTableTable VALUES (%s,%s,%s)" - self.dbToPutIn.query(q, (self.field + "heap", parentTab, code)) - if self.datatype=="categorical": - #Variable Info - - code = """ - DELETE FROM masterVariableTable WHERE dbname='%(field)s__id'; - INSERT IGNORE INTO masterVariableTable - (dbname, name, type, tablename, - anchor, alias, status,description) - VALUES - ('%(field)s__id','%(field)s','lookup','%(fasttab)s', - '%(anchor)s','%(alias)s','hidden','') """ % self.__dict__ - self.dbToPutIn.query(code) - #Separate Table Info - code = self.fastLookupTableIfNecessary() - self.dbToPutIn.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="%s";' %(self.field + "Lookup")) - -# code = escape_string(code) -# if isinstance(code, bytes): -# code = str(code, 'utf-8') -# if (code.startswith(b'b')): -# print("\n\n") -# print(code) - -# self.dbToPutIn.query(q) - - q = "INSERT INTO masterTableTable VALUES (%s, %s, %s)" - - self.dbToPutIn.query(q, (self.field+"Lookup", self.fasttab, code)) - - -# Ugh! This could probably be solved just by putting a lot of -# backticks in the code! 
- -mySQLreservedWords = set(["ACCESSIBLE", "ADD", -"ALL", "ALTER", "ANALYZE", "AND", "AS", "ASC", "ASENSITIVE", "BEFORE", -"BETWEEN", "BIGINT", "BINARY", "BLOB", "BOTH", "BY", "CALL", -"CASCADE", "CASE", "CHANGE", "CHAR", "CHARACTER", "CHECK", "COLLATE", -"COLUMN", "CONDITION", "CONSTRAINT", "CONTINUE", "CONVERT", "CREATE", -"CROSS", "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", -"CURRENT_USER", "CURSOR", "DATABASE", "DATABASES", "DAY_HOUR", -"DAY_MICROSECOND", "DAY_MINUTE", "DAY_SECOND", "DEC", "DECIMAL", -"DECLARE", "DEFAULT", "DELAYED", "DELETE", "DESC", "DESCRIBE", -"DETERMINISTIC", "DISTINCT", "DISTINCTROW", "DIV", "DOUBLE", "DROP", -"DUAL", "EACH", "ELSE", "ELSEIF", "ENCLOSED", "ESCAPED", "EXISTS", -"EXIT", "EXPLAIN", "FALSE", "FETCH", "FLOAT", "FLOAT4", "FLOAT8", -"FOR", "FORCE", "FOREIGN", "FROM", "FULLTEXT", "GENERAL", "GRANT", -"GROUP", "HAVING", "HIGH_PRIORITY", "HOUR_MICROSECOND", "HOUR_MINUTE", -"HOUR_SECOND", "IF", "IGNORE", "IGNORE_SERVER_IDS", "IN", "INDEX", -"INFILE", "INNER", "INOUT", "INSENSITIVE", "INSERT", "INT", "INT1", -"INT2", "INT3", "INT4", "INT8", "INTEGER", "INTERVAL", "INTO", "IS", -"ITERATE", "JOIN", "KEY", "KEYS", "KILL", "LEADING", "LEAVE", "LEFT", -"LIKE", "LIMIT", "LINEAR", "LINES", "LOAD", "LOCALTIME", -"LOCALTIMESTAMP", "LOCK", "LONG", "LONGBLOB", "LONGTEXT", "LOOP", -"LOW_PRIORITY", "MASTER_HEARTBEAT_PERIOD[c]", -"MASTER_SSL_VERIFY_SERVER_CERT", "MATCH", "MAXVALUE", "MEDIUMBLOB", -"MEDIUMINT", "MEDIUMTEXT", "MIDDLEINT", "MINUTE_MICROSECOND", -"MINUTE_SECOND", "MOD", "MODIFIES", "NATURAL", "NOT", -"NO_WRITE_TO_BINLOG", "NULL", "NUMERIC", "ON", "OPTIMIZE", "OPTION", -"OPTIONALLY", "OR", "ORDER", "OUT", "OUTER", "OUTFILE", "PRECISION", -"PRIMARY", "PROCEDURE", "PURGE", "RANGE", "READ", "READS", -"READ_WRITE", "REAL", "REFERENCES", "REGEXP", "RELEASE", "RENAME", -"REPEAT", "REPLACE", "REQUIRE", "RESIGNAL", "RESTRICT", "RETURN", -"REVOKE", "RIGHT", "RLIKE", "SCHEMA", "SCHEMAS", "SECOND_MICROSECOND", -"SELECT", "SENSITIVE", "SEPARATOR", "SET", "SHOW", "SIGNAL", -"SLOW[d]", "SMALLINT", "SPATIAL", "SPECIFIC", "SQL", "SQLEXCEPTION", -"SQLSTATE", "SQLWARNING", "SQL_BIG_RESULT", "SQL_CALC_FOUND_ROWS", -"SQL_SMALL_RESULT", "SSL", "STARTING", "STRAIGHT_JOIN", "TABLE", -"TERMINATED", "THEN", "TINYBLOB", "TINYINT", "TINYTEXT", "TO", -"TRAILING", "TRIGGER", "TRUE", "UNDO", "UNION", "UNIQUE", "UNLOCK", -"UNSIGNED", "UPDATE", "USAGE", "USE", "USING", "UTC_DATE", "UTC_TIME", -"UTC_TIMESTAMP", "VALUES", "VARBINARY", "VARCHAR", "VARCHARACTER", -"VARYING", "WHEN", "WHERE", "WHILE", "WITH", "WRITE", "XOR", -"YEAR_MONTH", "ZEROFILL", "WORDS", "NWORDS", "WORD", "UNIGRAM"]) - -class variableSet(object): - def __init__(self, - originFile=".bookworm/metadata/jsoncatalog_derived.txt", - anchorField="bookid", - jsonDefinition=None, - db=None): - self.db = db - self.anchorField = anchorField - self.originFile=originFile - self.jsonDefinition=jsonDefinition - logging.debug(jsonDefinition) - - if jsonDefinition==None: - logging.warning("No field_descriptions.json file provided, so guessing based " - "on variable names.") - self.jsonDefinition=self.guessAtFieldDescriptions() - else: - with open(jsonDefinition,"r") as fin: - self.jsonDefinition = json.loads(fin.read()) - - self.setTableNames() - self.catalogLocation = ".bookworm/metadata/" + self.tableName + ".txt" - - - self.variables = [] - - for item in self.jsonDefinition: - #The anchor field has special methods hard coded in. 
- - if item['field'] == self.anchorField: - continue - if item['field'].upper() in mySQLreservedWords: - logging.warning(item['field'] + """ is a reserved word in MySQL, so can't be used as a Bookworm field name: skipping it for now, but you probably want to rename it to something different""") - item['field'] = item['field'] + "___" - continue - self.variables.append(dataField(item,self.db,anchor=anchorField,table=self.tableName,fasttab=self.fastName)) - - def __repr__(self): - return "A variable set of {} objects".format(len(self.variables)) - - def setTableNames(self): - """ - For the base case, they're catalog and fastcat: otherwise, it's just they key - and the first variable associated with it. - """ - if os.path.split(self.originFile)[-1] == 'jsoncatalog_derived.txt': - self.tableName = "catalog" - self.fastName = "fastcat" - - else: - try: - self.tableName = self.jsonDefinition[0]['field'] + "_" + self.jsonDefinition[1]['field'] - except IndexError: - #if it's only one element long, just name it after the variable itself. - #Plus the string 'unique', to prevent problems of dual-named tables; - self.tableName = "unick_" + self.jsonDefinition[0]['field'] - - self.fastName = self.tableName + "heap" - - def guessAtFieldDescriptions(self,stopAfter=30000): - allMyKeys = dict() - unique = True - - for i, line in enumerate(open(self.originFile)): - try: - entry = json.loads(line.rstrip("\n")) - except: - logging.warning("Error in line {} of {}".format(i, self.originFile)) - logging.warning(line) - - for key in entry: - if type(entry[key])==list: - unique=False - else: - #Treat it for counting sake as a single element list. - entry[key] = [entry[key]] - for value in entry[key]: - try: - allMyKeys[key][value] += 1 - except KeyError: - try: - allMyKeys[key][value] = 1 - except KeyError: - allMyKeys[key] = dict() - allMyKeys[key][value] = 1 - if i > stopAfter: - break - - myOutput = [] - - for metadata in allMyKeys: - - bestGuess = guessBasedOnNameAndContents(metadata,allMyKeys[metadata]) - if unique==False: - bestGuess['unique'] = False - - myOutput.append(bestGuess) - - myOutput = [output for output in myOutput if output["field"] != "filename"] - - return myOutput - - def uniques(self,type="base"): - """ - Some frequent patterns that tend to need to be iterated through. - """ - - if type=="base": - return [variable for variable in self.variables if variable.unique] - if type=="fast": - return [variable for variable in self.variables if (variable.unique and variable.fastSQL() is not None)] - if type=="categorical": - return [variable for variable in self.variables if (variable.unique and variable.fastSQL() is not None and variable.datatype=="categorical")] - - def notUniques(self): - return [variable for variable in self.variables if not variable.unique] - - def anchorLookupDictionary(self): - db = self.db - anchor = self.anchorField - self.fastAnchor = self.anchorField - - if anchor == "bookid" and self.tableName != "catalog": - self.fastAnchor="bookid" - bookids = DummyDict() - - elif anchor=="filename" or anchor=="bookid": - self.fastAnchor = "bookid" - bookids = dict() - try: - """ - It is faster, better, and (on the first run only) sometimes necessary - to pull the textids from the original files, not the database. - """ - bookids = KV(".bookworm/metadata/textids.sqlite") - for variable in self.variables: - variable.anchor=self.fastAnchor - except IOError: - logging.info("Pulling bookids from catalog...") - results = db.query("SELECT bookid,filename FROM catalog;") - logging.info("... 
bookids have been retrieved.") - for row in results.fetchall(): - bookids[row[1]] = row[0] - logging.info("... and are loaded into a dictionary.") - for variable in self.variables: - variable.anchor=self.fastAnchor - else: - query = """SELECT alias FROM masterVariableTable WHERE dbname='%s'""" % (anchor) - bookids = dict() - cursor = db.query("SELECT alias FROM masterVariableTable WHERE dbname = '%s'" % anchor) - try: - fastAnchor = cursor.fetchall()[0][0] - except: - if anchor in ["bookid","filename"]: - fastAnchor="bookid" - logging.warning("Unable find an alias in the DB for anchor" + anchor + "\n\n") - self.fastAnchor=fastAnchor - if fastAnchor != anchor: - results = db.query("SELECT * FROM %sLookup_;" % (anchor)) - for row in results.fetchall(): - bookids[row[1]] = row[0] - self.anchor=fastAnchor - for variable in self.variables: - variable.anchor = fastAnchor - else: - #construct a phony dictionary that just returns what you gave - bookids = DummyDict() - - return bookids - - def writeMetadata(self,limit=float("Inf")): - #Write out all the metadata into files that MySQL is able to read in. - """ - This is a general purpose, with a few special cases for the primary use case that this is the - "catalog" table that hold the primary lookup information. - """ - linenum = 1 - variables = self.variables - bookids = self.anchorLookupDictionary() - - metadatafile = open(self.originFile) - - - #Open files for writing to - path = os.path.dirname(self.catalogLocation) - try: - os.makedirs(path) - except OSError: - if not os.path.isdir(path): - raise - - catalog = open(self.catalogLocation, 'w') - - for variable in [variable for variable in variables if not variable.unique]: - variable.output = open(variable.outputloc, 'w') - - for entry in metadatafile: - - try: - entry = json.loads(entry) - except: - logging.warning("""WARNING: json parsing failed for this JSON line: - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n""" + entry) - - continue - - #We always lead with the bookid and the filename. - #Unicode characters in filenames may cause problems? - if self.anchorField=="bookid" and self.tableName=="catalog": - self.anchorField="filename" - - filename = to_unicode(entry[self.anchorField]) - - try: - bookid = bookids[entry[self.anchorField]] - except KeyError: - if self.tableName=="catalog": - logging.warning("No entry for {}".format(entry[self.anchorField])) - continue - # bookid = bookids.bump(entry[self.anchorField]) - else: - #If the key isn't in the name table, we have no use for this entry. - continue - mainfields = [str(bookid),to_unicode(entry[self.anchorField])] - - if self.tableName != "catalog": - #It can get problematic to have them both, so we're just writing over the - #anchorField here. - mainfields = [str(bookid)] - # First, pull the unique variables and write them to the 'catalog' table - - for var in [variable for variable in variables if variable.unique]: - if var.field not in [self.anchorField,self.fastAnchor]: - myfield = entry.get(var.field, "") - if myfield is None: - myfield = '' - mainfields.append(to_unicode(myfield)) - catalogtext = '%s\n' % '\t'.join(mainfields) - try: - catalog.write(catalogtext) - except TypeError: - catalog.write(catalogtext) - - for variable in [variable for variable in variables if not variable.unique]: - # Each of these has a different file it must write to... 
- outfile = variable.output - lines = entry.get(variable.field, []) - if isinstance(lines, (str, bytes, int)): - """ - Allow a single element to be represented as a string - """ - lines = [lines] - if lines==None: - lines = [] - for line in lines: - try: - writing = '%s\t%s\n' % (str(bookid), to_unicode(line)) - outfile.write(writing) - except: - logging.warning("some sort of error with bookid no. " +str(bookid) + ": " + json.dumps(lines)) - pass - if linenum > limit: - break - linenum=linenum+1 - for variable in [variable for variable in variables if not variable.unique]: - variable.output.close() - catalog.close() - metadatafile.close() - - def loadMetadata(self): - """ - Load in the metadata files which have already been created elsewhere. - """ - - #This function is called for the sideffect of assigning a `fastAnchor` field - bookwormcodes = self.anchorLookupDictionary() - db = self.db - logging.info("Making a SQL table to hold the catalog data") - - if self.tableName=="catalog": - """A few necessary basic fields""" - mysqlfields = ["bookid MEDIUMINT UNSIGNED, PRIMARY KEY(bookid)", "filename VARCHAR(255)", "nwords INT"] - else: - mysqlfields = ["%s MEDIUMINT UNSIGNED, PRIMARY KEY (%s)" % (self.fastAnchor,self.fastAnchor)] - for variable in self.uniques(): - createstring = variable.slowSQL(withIndex=True) - mysqlfields.append(createstring) - - if len(mysqlfields) > 1: - #This creates the main (slow) catalog table - db.query("""DROP TABLE IF EXISTS %s """ % self.tableName) - createcode = """CREATE TABLE IF NOT EXISTS %s ( - """ % self.tableName + ",\n".join(mysqlfields) + ") ENGINE=MYISAM;" - try: - db.query(createcode) - except: - logging.error("Unable to create table for metadata: SQL Code follows") - logging.error(createcode) - raise - #Never have keys before a LOAD DATA INFILE - db.query("ALTER TABLE %s DISABLE KEYS" % self.tableName) - logging.info("loading data into %s using LOAD DATA LOCAL INFILE..." % self.tableName) - anchorFields = self.fastAnchor - - if self.tableName=="catalog": - anchorFields = "bookid,filename" - - loadEntries = { - "catLoc": self.catalogLocation, - "tabName": self.tableName, - "anchorFields": anchorFields, - "loadingFields": anchorFields + "," + ','.join([field.field for field in self.variables if field.unique]) - } - - loadEntries['loadingFields'] = loadEntries['loadingFields'].rstrip(',') - logging.debug("loading in data from " + self.catalogLocation) - loadcode = """LOAD DATA LOCAL INFILE '%(catLoc)s' - INTO TABLE %(tabName)s FIELDS ESCAPED BY '' - (%(loadingFields)s)""" % loadEntries - - db.query(loadcode) - logging.info("enabling keys on %s" %self.tableName) - db.query("ALTER TABLE %s ENABLE KEYS" % self.tableName) - - #If there isn't a 'searchstring' field, it may need to be coerced in somewhere hereabouts - - #This here stores the number of words in between catalog updates, so that the full word counts only have to be done once since they're time consuming. - if self.tableName=="catalog": - self.createNwordsFile() - - for variable in self.notUniques(): - variable.buildDiskTable() - - for variable in self.variables: - if variable.datatype=="categorical": - variable.build_ID_and_lookup_tables() - - if len(self.uniques()) > 0 and self.tableName!="catalog": - #catalog has separate rules handled in CreateDatabase.py. 
- fileCommand = self.uniqueVariableFastSetup("MYISAM") - for query in splitMySQLcode(fileCommand): - db.query(query) - - def uniqueVariableFastSetup(self,engine="MEMORY"): - fileCommand = "DROP TABLE IF EXISTS tmp;" - fileCommand += "CREATE TABLE tmp ({} MEDIUMINT UNSIGNED, PRIMARY KEY ({}), ".format( - self.fastAnchor,self.fastAnchor - ) - fileCommand += ",\n".join([variable.fastSQL() for variable in self.variables if (variable.unique and variable.fastSQL() is not None)]) - fileCommand += ") ENGINE=%s;\n" % engine - - fast_fields = self.fastAnchor + ", " + ",".join([variable.fastField for variable in self.variables if variable.unique and variable.fastSQL() is not None]) - - fileCommand += "INSERT INTO tmp SELECT " + fast_fields - fileCommand += " FROM %s " % self.tableName - fileCommand += " ".join([" JOIN %(field)s__id USING (%(field)s ) " % variable.__dict__ for variable in self.variables if variable.unique and variable.fastSQL() is not None and variable.datatype=="categorical"])+ ";\n" - - name = self.fastName - if engine=="MYISAM": - name += "_" - fileCommand += "DROP TABLE IF EXISTS %s;\n" % name - fileCommand += "RENAME TABLE tmp TO %s;\n" % name - - return fileCommand - - def updateMasterVariableTable(self): - """ - All the categorical variables get a lookup table; - we store the create code in the databse; - """ - for variable in self.variables: - # Make sure the variables know who their parent is - variable.fastAnchor = self.fastAnchor - # Update the referents for everything - variable.updateVariableDescriptionTable() - - inCatalog = self.uniques() - if len(inCatalog) > 0 and self.tableName!="catalog": - #catalog has separate rules handled in CreateDatabase.py; so this builds - #the big rectangular table otherwise. - #It will fail if masterTableTable doesn't exister. - fileCommand = self.uniqueVariableFastSetup() - try: - parentTab = self.db.query(""" - SELECT tablename FROM masterVariableTable - WHERE dbname='%s'""" % self.fastAnchor).fetchall()[0][0] - except: - if self.fastAnchor=="bookid": - parentTab="fastcat" - else: - logging.error("Unable to find a table to join the anchor (%s) against" % self.fastAnchor) - raise - self.db.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="%s";' %self.fastName) - self.db.query("INSERT INTO masterTableTable VALUES (%s, %s, %s)", (self.fastName,parentTab,escape_string(fileCommand))) - - def createNwordsFile(self): - """ - A necessary supplement to the `catalog` table. - """ - db = self.db - - db.query("CREATE TABLE IF NOT EXISTS nwords (bookid MEDIUMINT UNSIGNED, PRIMARY KEY (bookid), nwords INT);") - db.query("UPDATE catalog JOIN nwords USING (bookid) SET catalog.nwords = nwords.nwords") - db.query("INSERT INTO nwords (bookid,nwords) SELECT catalog.bookid,sum(count) FROM catalog LEFT JOIN nwords USING (bookid) JOIN master_bookcounts USING (bookid) WHERE nwords.bookid IS NULL GROUP BY catalog.bookid") - db.query("UPDATE catalog JOIN nwords USING (bookid) SET catalog.nwords = nwords.nwords") - - - -class DummyDict(dict): - """ - Stupid little hack. - Looks like a dictionary, but just returns itself. - Used in cases where we don't actually need the dictionary. - """ - # we need to have it there. 
- def __missing__(self,key): - return key diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index 9cf4b68..da28148 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -1,28 +1,74 @@ -from bookwormDB.general_API import SQLAPIcall as SQLAPIcall +from bookwormDB.general_API import DuckDBCall, Caching_API, ProxyAPI import json from urllib.parse import unquote import logging +logger = logging.getLogger("bookworm") + import multiprocessing import gunicorn.app.base +from bookwormDB.store import store +from .store import store +from .query_cache import Query_Cache +from pathlib import Path +import duckdb + from datetime import datetime + def content_type(query): try: format = query['format'] except: return 'text/plain' - + if format == "json": return "application/json" - - if format == "feather": + + if format == "feather" or format == "parquet": return "application/octet-stream" - + if format == "html": return "text/html" - + return 'text/plain' + +args = store()['args'] +if args.cache != "none": + query_cache = Query_Cache( + args.cache, + max_entries = 256, + max_length = 2**8, + cold_storage = args.cold_storage) + + +class DuckPool(dict): + def __missing__(self, key): + # Mother duck said 'quack quack quack quack' + # and all of her five little duckies came back. + duck_dir = args.db_directory + self[key] = duckdb.connect(str(Path(duck_dir) / key), read_only = True) + return self[key] + def options(self): + duck_dir = args.db_directory + return [f.name for f in args.db_directory.glob("*") if f.is_file()] + +duck_connections = DuckPool() + +if args.remote_host is None: + logger.info("Using SQL API") + API = DuckDBCall + API_kwargs = {} + +else: + logger.info("Using proxy API") + API = ProxyAPI + API_kwargs = { + "endpoint": args.remote_host + } + + + def application(environ, start_response, logfile = "bookworm_queries.log"): # Starting with code from http://wsgi.tutorial.codepoint.net/parsing-the-request-post try: @@ -37,13 +83,18 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): q = environ.get('QUERY_STRING') try: ip = environ.get('HTTP_X_FORWARDED_FOR') - # logging.debug("Request from {}".format(ip)) except: ip = environ.get('REMOTE_ADDR') if ip is None: ip = environ.get('REMOTE_ADDR') + + # Caching IPs directly is probably in violation of GPDR. + # It's nice to have session browsing data, so we'll grab just the + # last byte which should be enough to get something out of. + ip = ip.split(".")[-1] + query = unquote(q) - + headers = { 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'GET, POST, PUT, OPTIONS', @@ -52,16 +103,14 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): 'charset': 'utf-8' } - - - logging.debug("Received query {}".format(query)) + logger.debug("Received query {}".format(query)) start = datetime.now() # Backward-compatability: we used to force query to be # a named argument. 
query = query.strip("query=") query = query.strip("queryTerms=") - + try: query = json.loads(query) query['ip'] = ip @@ -70,16 +119,26 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): status = '404' start_response(status, list(headers.items())) return [b'{"status":"error", "message": "You have passed invalid JSON to the Bookworm API"}'] - - process = SQLAPIcall(query) - response_body = process.execute() + + if query['method'] and query['method'] == "endpoints": + query['format'] = "json" + response_body = json.dumps({ + 'status': 'success', + 'data': duck_connections.options() + }) + else: + if args.cache == "none": + process = API(query=query, db=duck_connections[query['database']], **API_kwargs) + else: + process = Caching_API(query, query_cache, API, **API_kwargs) + response_body = process.execute() # It might be binary already. headers['Content-type'] = content_type(query) - + if headers['Content-type'] != 'application/octet-stream': response_body = bytes(response_body, 'utf-8') - + headers['Content-Length'] = str(len(response_body)) status = '200 OK' start_response(status, list(headers.items())) @@ -90,19 +149,20 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): with open(logfile, 'a') as fout: json.dump(query, fout) fout.write("\n") - logging.debug("Writing to log: \n{}\n".format(json.dumps(query))) + logger.debug("Writing to log: \n{}\n".format(json.dumps(query))) return [response_body] # Copied from the gunicorn docs. - def number_of_workers(): - return (multiprocessing.cpu_count() * 2) + 1 + return (multiprocessing.cpu_count()) + 1 class StandaloneApplication(gunicorn.app.base.BaseApplication): + """ Superclassed to allow bookworm to do the running. """ + def __init__(self, app, options=None): self.options = options or {} self.application = app @@ -117,14 +177,19 @@ def load_config(self): def load(self): return self.application -def run(port = 10012, workers = number_of_workers()): +def run(port = 10012, bind="0.0.0.0", workers = number_of_workers()): + """ + port: the service port + bind: the host to bind to. Requests that don't match this address + will be ignored. The default accepts all connections: 127.0.0.1 listens + only to localhost, for when you're hiding it behind nginx or apache or something. 
+ """ if workers==0: workers = number_of_workers() - + options = { - 'bind': '{}:{}'.format('127.0.0.1', port), + 'bind': f'{bind}:{port}', 'workers': workers, } - + StandaloneApplication(application, options).run() - diff --git a/setup.py b/setup.py index 1aa0aa5..588f06d 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,6 @@ author_email="bmschmidt@gmail.com", license="MIT", classifiers=[ - 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Intended Audience :: Education', "Natural Language :: English", @@ -34,8 +33,9 @@ "Topic :: Text Processing :: Indexing", "Topic :: Text Processing :: Linguistic" ], - install_requires=["numpy","pandas","mysqlclient", + install_requires=["pandas","mysqlclient", + "duckdb", "python-dateutil", "psutil", "bounter", - "gunicorn" + "gunicorn", "regex", "pyarrow" ] ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/setup.py b/tests/setup.py deleted file mode 100644 index 03f9fd7..0000000 --- a/tests/setup.py +++ /dev/null @@ -1,92 +0,0 @@ -from __future__ import print_function -import bookwormDB -import bookwormDB.CreateDatabase -from bookwormDB.general_API import SQLAPIcall as SQLAPIcall -import logging -import os -from subprocess import call as call -import sys -import json -from shutil import rmtree - -def setup_bookworm(): - """ - Creates a test bookworm. Removes any existing databases called "federalist_bookworm" - """ - logging.info("\n\nTESTING BOOKWORM CREATION\n\n") - import MySQLdb - from warnings import filterwarnings - filterwarnings('ignore', category = MySQLdb.Warning) - - import bookwormDB.configuration - os.chdir(sys.path[0] + "/test_bookworm_files") - rmtree(".bookworm", ignore_errors = True) - - bookwormDB.configuration.create(ask_about_defaults=False, database="federalist_bookworm") - - db = bookwormDB.CreateDatabase.DB(dbname="mysql") - - try: - db.query("DROP DATABASE IF EXISTS federalist_bookworm") - except MySQLdb.OperationalError as e: - if e[0]==1008: - pass - else: - print(e) - raise - except Exception as e: - """ - This is some weird MariaDB exception. It sucks that I'm compensating for it here. - """ - if e[0]=="Cannot load from mysql.proc. The table is probably corrupted": - pass - else: - print(e) - logging.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?") - - call(["bookworm --log-level warning build all"],shell=True,cwd=sys.path[0] + "/test_bookworm_files") - - -def setup_bookworm_unicode(): - """ - Creates a test bookworm. Removes any existing databases called "unicode_test_bookworm" - """ - logging.info("\n\nTESTING BOOKWORM CREATION\n\n") - import MySQLdb - from warnings import filterwarnings - filterwarnings('ignore', category = MySQLdb.Warning) - - import bookwormDB.configuration - os.chdir(sys.path[0] + "/test_bookworm_files_unicode") - rmtree(".bookworm", ignore_errors = True) - - bookwormDB.configuration.create(ask_about_defaults=False,database="unicode_test_bookworm") - - db = bookwormDB.CreateDatabase.DB(dbname="mysql") - - try: - db.query("DROP DATABASE IF EXISTS unicode_test_bookworm") - except MySQLdb.OperationalError as e: - if e[0]==1008: - pass - else: - print(e) - raise - except Exception as e: - """ - This is some weird MariaDB exception. It sucks that I'm compensating for it here. - """ - if e[0]=="Cannot load from mysql.proc. 
The table is probably corrupted": - pass - else: - logging.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?") - - call(["bookworm --log-level warning build all"], - shell=True, - cwd=sys.path[0] + "/test_bookworm_files_unicode") - - -if __name__=="__main__": - setup_bookworm() - setup_bookworm_unicode() - diff --git a/tests/test_API.py b/tests/test_API.py index 4fab13a..1a728c2 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -1,50 +1,80 @@ # -*- coding: utf-8 -*- -from builtins import range -from builtins import object -import unittest +import pytest import bookwormDB -import bookwormDB.CreateDatabase -from bookwormDB.general_API import SQLAPIcall as SQLAPIcall +from bookwormDB.general_API import DuckDBCall as DuckDBCall +from bookwormDB.builder import BookwormCorpus +from pathlib import Path import logging import os +import duckdb from subprocess import call as call import sys import json -from setup import setup_bookworm, setup_bookworm_unicode - -class Bookworm_SQL_Creation(unittest.TestCase): - - def test_bookworm_files_exist(self): - bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase("federalist_bookworm") - db = bookworm.db - db.query("USE federalist_bookworm") - wordCount = db.query("SELECT SUM(nwords) FROM fastcat_").fetchall()[0][0] +import pytest + +@pytest.fixture(scope="session") +def federalist_bookworm(tmpdir_factory): + path = tmpdir_factory.mktemp("ascii").join("federalist.duckdb") + tmpdir = tmpdir_factory.mktemp("tmpdir") + corp = BookwormCorpus( + f"{path}", + ngrams = 2, + texts = Path('tests/test_bookworm_files/input.txt'), + metadata = "tests/test_bookworm_files/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + con = duckdb.connect(str(path), read_only = True) + return con + +@pytest.fixture(scope="session") +def unicode_bookworm(tmpdir_factory): + path = tmpdir_factory.mktemp("unicode").join("unicode.duckdb") + tmpdir = tmpdir_factory.mktemp("tmpdir") + corp = BookwormCorpus( + f"{path}", + ngrams = 1, + texts = Path('tests/test_bookworm_files_unicode/input.txt'), + metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + con = duckdb.connect(str(path), read_only = True) + return con + +class Test_Bookworm_SQL_Creation(): + def test_nwords_populated(self, federalist_bookworm): + wordCount = federalist_bookworm.query('SELECT SUM(nwords) FROM fastcat').fetchall()[0][0] + # This should be about 212,081, but I don't want the tests to start failing when + # we change the tokenization rules or miscellaneous things about encoding. + assert wordCount > 200000 + """ + Then we test whether the API can make queries on that bookworm. + """ + + def test_fastcat_populated(self, federalist_bookworm): + textCount = federalist_bookworm.query('SELECT COUNT(*) FROM fastcat').fetchall()[0][0] # This should be 212,081, but I don't want the tests to start failing when # we change the tokenization rules or miscellaneous things about encoding. - self.assertTrue(wordCount>100000) + assert textCount == 1333 """ Then we test whether the API can make queries on that bookworm. 
""" - def test_API(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall - import json + def test_groups(self, federalist_bookworm): query = { "database":"federalist_bookworm", "search_limits":{}, "counttype":"TextPercent", "groups":["author"], - "method":"data", "format":"json" + "method":"data", + "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertEqual(len(m),5) + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m) == 5 - - def test_multiword_search(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_multiword_search(self, federalist_bookworm): import json query = { @@ -55,11 +85,10 @@ def test_multiword_search(self): "groups": [] } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(m[0] > 33) + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert m[0]["TextPercent"] > 33 - def test_ne_with_one_entry(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_ne_with_one_entry(self, federalist_bookworm): import json query = { @@ -72,11 +101,10 @@ def test_ne_with_one_entry(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==4) + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m)==4 - def test_ne_with_two_entries(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_ne_with_two_entries(self, federalist_bookworm): import json query = { @@ -89,12 +117,11 @@ def test_ne_with_two_entries(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==3) + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m)==3 - def test_ne_with_two_entries(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_ne_with_two_entries(self, federalist_bookworm): import json query = { @@ -107,12 +134,11 @@ def test_ne_with_two_entries(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==3) + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m)==3 - def test_or_with_two_entries(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_or_with_two_entries(self, federalist_bookworm): import json query = { @@ -128,11 +154,10 @@ def test_or_with_two_entries(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertEqual(len(m),2) + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m) == 2 - def test_lte_and_gte(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_lte_and_gte(self, federalist_bookworm): import json query = { @@ -145,11 +170,10 @@ def test_lte_and_gte(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==6) + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m)==6 - def test_and_with_two_entries(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_and_with_two_entries(self, federalist_bookworm): import json query = { @@ -165,10 +189,10 @@ def test_and_with_two_entries(self): "method":"data", "format":"json" } - m = 
json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==0) + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m)==0 - def test_adding_metadata_to_bookworm(self): + def ftest_adding_metadata_to_bookworm(self): """ Build out some dummy metadata: label the difference between even and odd paragrahs. @@ -217,8 +241,7 @@ class Dummy(object): "method":"data", "format":"json" } - SQLAPIcall(query) - m = json.loads(SQLAPIcall(query).execute())['data'] +# m = json.loads(SQLAPIcall(query).execute())['data'] # Even or odd is one of two things. self.assertTrue(len(m)==2) @@ -227,7 +250,7 @@ class Dummy(object): self.assertTrue(m['odd'][0]>=m['even'][0]) - def test_case_sensitivity(self): + def test_case_sensitivity(self, federalist_bookworm): query = { "database":"federalist_bookworm", "search_limits":{"word":["the"]}, @@ -237,19 +260,17 @@ def test_case_sensitivity(self): "method":"data", "format":"json" } - SQLAPIcall(query) - val1 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] > 0) + val1 = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert(val1[0]["WordCount"] > 0) query["words_collation"] = "Case_Insensitive" - SQLAPIcall(query) - val2 = json.loads(SQLAPIcall(query).execute())['data'] + val2= json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] # The words ('The','the') appear more often than ('the') alone. - self.assertTrue(val2[0] > val1[0]) + assert (val2[0]["WordCount"] > val1[0]["WordCount"]) - def test_case_insensitivity_works_without_search_term(self): + def test_case_insensitivity_works_without_search_term_existing(self, federalist_bookworm): query = { "database":"federalist_bookworm", "search_limits":{"word":["hOwEvEr"]}, @@ -258,41 +279,46 @@ def test_case_insensitivity_works_without_search_term(self): "words_collation":"Case_Insensitive", "method":"data", "format":"json" } - SQLAPIcall(query) - val1 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] > 0) + val = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert (val[0]["WordCount"] > 0) - def test_unicode_search_term(self): + def test_unicode_search_term(self, unicode_bookworm): query = { "database":"unicode_test_bookworm", - "search_limits":{"word":[u"ᎾᏍᎩ"]}, + "search_limits":{"word":["ᎾᏍᎩ"]}, "counttype":"WordCount", "groups":[], "words_collation":"Case_Insensitive", "method":"data", "format":"json" } - SQLAPIcall(query) - val1 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] > 0) + val = json.loads(DuckDBCall(db = unicode_bookworm, query = query).execute())['data'] + assert (val[0]["WordCount"] > 0) - def test_various_unicode_cases(self): + def test_various_unicode_cases(self, unicode_bookworm): # There's a 'description_' for each individual item. 
- catalog_location = sys.path[0] + "/test_bookworm_files_unicode/jsoncatalog.txt" - cases = [json.loads(line)["description_"] for line in open(catalog_location)] + catalog_location = "tests/test_bookworm_files_unicode/jsoncatalog.txt" + cases = [json.loads(line)["description_"] for line in open(catalog_location)] + wordcounts = unicode_bookworm.query("SELECT * FROM nwords").df()['nwords'] + fastcounts = unicode_bookworm.query("SELECT * FROM fastcat").df()['nwords'] + assert (wordcounts > 0).all() + assert (fastcounts > 0).all() for case in cases: query = { "database":"unicode_test_bookworm", - "search_limits":{"description_":case}, - "counttype":"WordCount", - "groups":[], - "words_collation":"Case_Insensitive", - "method":"data", "format":"json" + "search_limits": {"description_": case}, + "counttype": "WordCount", + "groups": [], + "words_collation": "Case_Insensitive", + "method": "data", "format": "json" } - SQLAPIcall(query) - val1 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] > 0) - - def test_asterisks_in_search_limits(self): + try: + val = json.loads(DuckDBCall(db = unicode_bookworm, query = query).execute())['data'] + except KeyError: + print(DuckDBCall(db = unicode_bookworm, query = query).execute()) + raise + assert(val[0]["WordCount"] > 0) + + def test_asterisks_in_search_limits(self, federalist_bookworm): """ The following two queries should, by definition, produce the same result. """ @@ -303,8 +329,9 @@ def test_asterisks_in_search_limits(self): "counttype":"WordsPerMillion", "groups":[], "method":"data", "format":"json" - } - val1 = json.loads(SQLAPIcall(query).execute())['data'] + } + + val1 = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] query = { "database":"federalist_bookworm", @@ -313,46 +340,7 @@ def test_asterisks_in_search_limits(self): "groups":[], "method":"data", "format":"json" } - val2 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] == val2[0]) - - -""" -class SQLConnections(unittest.TestCase): - - - - def test_dunning(self): - query = { - "database":"federalist", - "search_limits":{"author":"Hamilton"}, - "compare_limits":{"author":"Madison"}, - "counttype":"Dunning", - "groups":["unigram"], - "method":"data", "format":"json" - } - + val2 = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert(val1[0]["WordsPerMillion"] == val2[0]["WordsPerMillion"]) - try: - #dbbindings.main(query) - worked = True - except: - worked = False - - self.assertTrue(worked) -""" - - -if __name__=="__main__": - # The setup is done without verbose logging; any failure - # causes it to try again. - logging.basicConfig(level=40) - try: - setup_bookworm() - setup_bookworm_unicode() - except: - logging.basicConfig(level=10) - setup_bookworm() - setup_bookworm_unicode() - logging.basicConfig(level=10) - unittest.main() + \ No newline at end of file diff --git a/tests/test_config.py b/tests/test_config.py deleted file mode 100644 index 69d853d..0000000 --- a/tests/test_config.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -from bookwormDB.manager import BookwormManager -import unittest -import logging -import os -import sys - -class Bookworm_Configuration(unittest.TestCase): - - def test_config(self): - bookworm = BookwormManager(None, "federalist_bookworm") - - -if __name__=="__main__": - # The setup is done without verbose logging; any failure - # causes it to try again. 
- unittest.main() diff --git a/tests/test_creation.py b/tests/test_creation.py new file mode 100644 index 0000000..4162a19 --- /dev/null +++ b/tests/test_creation.py @@ -0,0 +1,36 @@ +import pytest +from pathlib import Path +from bookwormDB.builder import BookwormCorpus +import duckdb + +class TestCreation(): + def test_ascii_creation(self, tmpdir): + path = Path(f"{tmpdir}/federalist.duckdb") + + corp = BookwormCorpus( + path, + ngrams = 2, + texts = Path('tests/test_bookworm_files/input.txt'), + metadata = "tests/test_bookworm_files/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + con = duckdb.connect(str(path)) + ts = con.execute("""SELECT sum(nwords) as 'WordCount' FROM "fastcat" """).fetchall()[0][0] + assert ts > 20 + + def test_unicode_creation(self, tmpdir): + path = Path(f"{tmpdir}/unicode.duckdb") + if path.exists(): path.unlink() + corp = BookwormCorpus( + path, + ngrams = 2, + texts = Path('tests/test_bookworm_files_unicode/input.txt'), + metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + con = duckdb.connect(str(path)) + # There's a 'description_' for each individual item. + ts = con.execute("""SELECT sum(nwords) as 'WordCount' + FROM "slowcat" NATURAL JOIN "fastcat" """).fetchall()[0][0] + assert ts > 20 + \ No newline at end of file diff --git a/tests/test_mysql.py b/tests/test_mysql.py deleted file mode 100644 index ffd0d62..0000000 --- a/tests/test_mysql.py +++ /dev/null @@ -1,63 +0,0 @@ -from builtins import hex -import unittest -import bookwormDB -from bookwormDB.configuration import Configfile -import bookwormDB.CreateDatabase -import logging -import MySQLdb -import random - -logging.basicConfig(level=10) - - -""" -Tests of the MySQL configuration. -""" - -class Bookworm_MySQL_Configuration(unittest.TestCase): - def test_server_connection(self): - logging.info("\n\nTESTING SERVER CONNECTION\n\n") - """ - Connect to MySQL and run a simple query. - """ - import bookwormDB.CreateDatabase - db = bookwormDB.CreateDatabase.DB(dbname="mysql") - sampleQuery=db.query("SELECT 1+1").fetchall() - self.assertTrue(sampleQuery[0][0]==2) - - """ - To properly test things, we actually build some bookworms. - This assumes that the directory '/tmp' is writeable, - which isn't strictly necessary for a bookworm to be built. 
- """ - - def test_config_files(self): - logging.info("\n\nTESTING CONFIG FILE ACCESS\n\n") - def test_config_file(conf): - user = conf.config.get("client","user") - pw = conf.config.get("client","password") - return (user,pw) - - global_configuration_file = Configfile("read_only") - admin_configuration_file = Configfile("admin") - - (admin_user,admin_pw) = test_config_file(global_configuration_file) - (client_user,client_pw) = test_config_file(admin_configuration_file) - logging.info("admin user is {} and password is {}".format(admin_user,admin_pw)) - logging.info("client user is {} and password is {}".format(client_user,client_pw)) - logging.info("Checking that admin and client users are distinct") - self.assertTrue(admin_user != client_user) - - def test_createDB_permission(self): - logging.info("\nTESTING ABILITY TO CREATE DATABASES\n\n") - import bookwormDB.configuration - dbname = "A" + hex(random.getrandbits(128))[2:-1] - import bookwormDB.CreateDatabase - db = bookwormDB.CreateDatabase.DB(dbname="mysql") - cursor = db.query("CREATE DATABASE {}".format(dbname)) - cursor.execute("DROP DATABASE {}".format(dbname)) - cursor.close() - - -if __name__=="__main__": - unittest.main() diff --git a/tests/test_sql_construction.py b/tests/test_sql_construction.py new file mode 100644 index 0000000..0f4cfd9 --- /dev/null +++ b/tests/test_sql_construction.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +import pytest +import bookwormDB +from bookwormDB.general_API import DuckDBCall as DuckDBCall +from bookwormDB.builder import BookwormCorpus +from pathlib import Path +import logging +logger = logging.getLogger("bookworm") + +import os +import duckdb +from subprocess import call as call +import sys +import json +import pytest + +@pytest.fixture(scope="session") +def federalist_bookworm(tmpdir_factory): + path = tmpdir_factory.mktemp("ascii").join("federalist.duckdb") + tmpdir = tmpdir_factory.mktemp("tmpdir") + corp = BookwormCorpus( + f"{path}", + ngrams = 2, + texts = Path('tests/test_bookworm_files/input.txt'), + metadata = "tests/test_bookworm_files/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + con = duckdb.connect(str(path), read_only = True) + return con + +@pytest.fixture(scope="session") +def unicode_bookworm(tmpdir_factory): + path = tmpdir_factory.mktemp("unicode").join("unicode.duckdb") + tmpdir = tmpdir_factory.mktemp("tmpdir") + + corp = BookwormCorpus( + f"{path}", + ngrams = 2, + texts = Path('tests/test_bookworm_files_unicode/input.txt'), + metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + + corp.build() + con = duckdb.connect(str(path), read_only = True) + return con + +class Test_Bookworm_SQL_Creation(): + + def test_ne_with_one_entry(self, federalist_bookworm): + import json + + query = { + "database":"federalist_bookworm", + "search_limits":{ + "author": {"$ne": ["HAMILTON"]} + }, + "counttype":"TextPercent", + "groups":["author"], + "method":"data", "format":"json" + } + + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m)==4 \ No newline at end of file diff --git a/tests/unimplemented_est_formats.py b/tests/unimplemented_est_formats.py new file mode 100644 index 0000000..ccab3a0 --- /dev/null +++ b/tests/unimplemented_est_formats.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- + +from builtins import range +from builtins import object +import pytest +import bookwormDB +import 
logging +logger = logging.getLogger("bookworm") + +import os +from subprocess import call as call +import sys +import json +from setup import setup_bookworm, setup_bookworm_unicode +from pyarrow import feather +import io + +class TestFormats: + + def test_feather(self): + from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + query = { + "database":"federalist_bookworm", + "search_limits":{}, + "counttype":"TextPercent", + "groups":["author"], + "method":"data", + "format":"feather" + } + + feather_file = SQLAPIcall(query).execute() + f = io.BytesIO(feather_file) + f.seek(0) + m = feather.read_feather(f) + self.assertEqual(m.shape[0],5) + self.assertEqual(m.shape[1],2) + + + def test_proxy_API(self): + from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + + import json + + query = { + "database":"federalist_bookworm", + "search_limits":{}, + "counttype":"TextPercent", + "groups":["author"], + "method":"data", + "format":"json" + } + + m = json.loads(SQLAPIcall(query).execute())['data'] + self.assertEqual(len(m),5) + \ No newline at end of file
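The parked `tests/unimplemented_est_formats.py` file above still targets the removed `SQLAPIcall` and unittest-style `self.assertEqual` calls. A rough sketch of how the feather round-trip check might be rewritten against the `DuckDBCall` interface and the session-scoped `federalist_bookworm` fixture defined in `tests/test_API.py` — assuming, as the WSGI layer suggests, that a `format: "feather"` request returns the serialized table as raw bytes:

```python
import io

from pyarrow import feather

from bookwormDB.general_API import DuckDBCall


def test_feather_roundtrip(federalist_bookworm):
    # Same query as the unimplemented test above, routed through DuckDBCall.
    # The federalist_bookworm fixture (a read-only DuckDB connection) would
    # need to live in a conftest.py for this sketch to be collected here.
    query = {
        "database": "federalist_bookworm",
        "search_limits": {},
        "counttype": "TextPercent",
        "groups": ["author"],
        "method": "data",
        "format": "feather",
    }
    # Assumption: feather output comes back as bytes (the WSGI code serves
    # it as application/octet-stream without re-encoding).
    feather_bytes = DuckDBCall(db=federalist_bookworm, query=query).execute()
    table = feather.read_feather(io.BytesIO(feather_bytes))
    assert table.shape[0] == 5   # five authors in the federalist corpus
    assert table.shape[1] == 2   # author plus TextPercent columns
```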