diff --git a/queryParse.py b/queryParse.py new file mode 100644 index 0000000..830801d --- /dev/null +++ b/queryParse.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +import re +import sqlparse + +def parseSQL(SQL): + ''' + Preprocessing the SQL in order to match the operator using re + + Operator ReplaceCode + > op0 + < op1 + == op2 + >= op3 + <= op4 + <> op5 + ''' + SQL = SQL.replace('>=','op3') + SQL = SQL.replace('<=','op4') + SQL = SQL.replace('<>','op5') + SQL = SQL.replace('>','op0') + SQL = SQL.replace('<','op1') + SQL = SQL.replace('==','op2') + ''' + Operator ReplaceCode + + op6 + - op7 + * op8 + / op9 + ''' + SQL = SQL.replace('+','op6') + SQL = SQL.replace('-','op7') + SQL = SQL.replace('*','op8') + SQL = SQL.replace('/','op9') + # print(SQL) + ''' + Step1. Split the SQL into a list + This module split the SQL statement into key-value + For example: "SELECT id FROM tableA WHERE loan>100000" will parse and split as: + [["SELECT": id], + ["FROM": tableA], + ["WHERE": loan>100000]] + ''' + parsed = sqlparse.parse(SQL) + stmt = parsed[0] + parsed_list = [] + for token in stmt.tokens: + if token.value != ' ': + if token.value.upper().__contains__("WHERE"): + parsed_list.append("WHERE") + parsed_list.append(token.value[6:]) + else: + parsed_list.append(token.value) + # print("Successfully execute step1:") + # print(parsed_list) + + ''' + Step2. Combine each part to key-value + + Note: this module just for test, only support simple SELECT SQL + ''' + SELECT_KEYWORDS = ['SELECT', 'FROM', 'WHERE', 'ORDER BY'] + INSERT_KEYWORDS = [] + UPDATE_KEYWORDS = [] + DELETE_KEYWORDS = [] + + SQL_dict = {} + for index in range(len(parsed_list)): + if parsed_list[index].upper() in SELECT_KEYWORDS: + SQL_dict[parsed_list[index].upper()] = parsed_list[index + 1] + index += 1 + # print("Successfully execute step2:") + # print(SQL_dict) + + ''' + Step3. substract the token.value into operator buffer + + For example: if the select condition is "loan>50000 AND loan<1000000", this condition will parse as: + ['loan>50000', 'AND', 'loan<1000000'] + + Above are three subconditions, and we'll substract the subconditions as: + [['loan', '>', '50000'], ['AND'], ['loan', '<', '1000000']] + + Note: this module just for test, only support single table SELECT SQL + ''' + + pattern_conj = r'(\s+AND\s+|\s+OR\s+|\s*,\s*|\s*\(\s*|\s*\)\s*)' + pattern_op = r'(\s*op1\s*|\s*op2\s*|\s*op3\s*|\s*op4\s*|\s*op5\s*|\s*op6\s*|\s*op7\s*|\s*op8\s*|\s*op9\s*|\s*op0\s*|)' + for key in SQL_dict: + SQL_dict[key] = re.split(pattern_conj, SQL_dict[key]) + value = [] + for tuple in SQL_dict[key]: + tuple = re.split(pattern_op, tuple) + if tuple != ['']: + value.append(tuple) + SQL_dict[key] = value + + # print("Successfully execute step3:") + # for key in SQL_dict: + # print("keywords: " + key) + # print(SQL_dict[key]) + + return SQL_dict + + +if __name__ == '__main__': + res = parseSQL("SELECT ID,AVG(DEPOSIT),MAX(LOAN1) FROM TABLE") + print(res) \ No newline at end of file