大数据之superset

从Python的序列创建DBF文件,itertools,使用docker进行安装,docker安装完成后还需要安装docker-compose,对目录下保留最后的zip文件,表示对c,复制代码 代码如下,对目录下保留最后的zip文件

金沙澳门官网7817网址 1

读取并遍历dBASE或Xbase文件中的记录;从Python的序列创建DBF文件。

```python
import struct, datetime, decimal, itertools

1、概述

 

Python实现的文件夹清理程序分享:

使用:

复制代码 代码如下:

foldercleanup.py -d 10 -k c:\test\keepfile.txt c:\test

表示对c:\test目录只保留最近10天的子文件夹和keepfile.txt中指定的子文件夹。

代码:

复制代码 代码如下:

import os
import os.path
import datetime
 
def getOption():
    """Parse the command line of the folder clean-up tool.

    The arguments are read from ``sys.argv`` through ``optparse``.  Exactly
    one positional argument (the directory to clean) is expected.

    Returns:
        A ``(days, keepfile, directory)`` tuple.  ``days`` and ``keepfile``
        are the raw option strings, or ``None`` when the option was not
        given.  ``None`` is returned (after printing an error) when the
        number of positional arguments is not exactly one.
    """
    from optparse import OptionParser

    parser = OptionParser(
        description="clean up the folder with some options",
        prog="foldercleanup",
        version="%prog 0.0.1",
        usage="%prog [options] foldername",
        add_help_option=True,
    )
    parser.add_option(
        '-d', '--days', action='store', type='string', dest='days',
        help="keep the subfolders which are created in recent %days% days")
    parser.add_option(
        '-k', '--keepfile', action='store', type='string', dest='keepfile',
        help="keep the subfolders which are recorded in text file %keepfile% ")
    options, arguments = parser.parse_args()

    if len(arguments) != 1:
        print("error: must input one directory as only one parameter ")
        return None

    return options.days, options.keepfile, arguments[0]

 
def preCheckDir(dir):
    """Validate that *dir* names an existing directory.

    Args:
        dir: Path given on the command line.

    Returns:
        The absolute form of *dir*, or ``None`` (after printing an error)
        when the path does not exist or is not a directory.
    """
    if not os.path.exists(dir):
        print("error: the directory your input is not existed")
        return None
    if not os.path.isdir(dir):
        print("error: the parameter your input is not a directory")
        return None

    return os.path.abspath(dir)
 
def isKeepByDay(dir, day):
    """Tell whether *dir* is recent enough to be kept.

    Args:
        dir: Path of the folder to inspect.
        day: Number of days (int or numeric string) a folder stays
            protected, or ``None`` to disable the age rule.

    Returns:
        ``True`` when ``os.path.getctime(dir)`` falls on or after the date
        *day* days before today; ``False`` otherwise, and always ``False``
        when *day* is ``None``.

    Raises:
        ValueError: if *day* cannot be converted with ``int()``.
    """
    indays = False
    if day is not None:
        # NOTE: getctime is the creation time on Windows but the last
        # metadata change on Unix, so "recent" is platform dependent.
        createdate = datetime.date.fromtimestamp(os.path.getctime(dir))
        indate = datetime.date.today() - datetime.timedelta(days=int(day))
        print(createdate)
        indays = createdate >= indate
    print(indays)
    return indays
 
def isKeepByKeepfile(dir, keepfile):
    """Tell whether *dir* is listed in the keep file.

    Each non-blank line of *keepfile* is a folder name.  *dir* matches when
    its path ends with a path separator followed by that name, compared
    case-insensitively.  A backslash is always accepted as separator, in
    addition to the separators of the current platform.

    Args:
        dir: Path of the folder to inspect.
        keepfile: Path of the text file listing protected folder names, or
            ``None`` to disable the rule.

    Returns:
        ``True`` when a line of the keep file matches *dir*; ``False``
        otherwise, including when the keep file cannot be opened or read
        (an error is printed in that case).
    """
    needkeep = False
    print(dir)
    if keepfile is not None:
        separators = set(["\\", os.sep])
        if os.altsep:
            separators.add(os.altsep)
        target = dir.upper()
        try:
            # 'with' closes the file even when reading fails part-way.
            with open(keepfile, "r") as kf:
                for line in kf:
                    print(line)
                    name = line.strip().upper()
                    if not name:
                        # A blank line would otherwise match every path
                        # that merely ends with a separator.
                        continue
                    suffixes = tuple(sep + name for sep in separators)
                    if target.endswith(suffixes):
                        needkeep = True
        except (IOError, OSError):
            # NOTE(review): the caller treats this as "nothing listed", so
            # an unreadable keep file protects no folder.
            print("error: keep file cannot be opened")
    print(needkeep)
    return needkeep
   
def removeSubFolders(dir, day, keepfile):
    """Delete the direct subfolders of *dir* that no rule protects.

    A subfolder is kept when ``isKeepByDay`` or ``isKeepByKeepfile``
    returns true for it; every other subfolder is removed recursively.
    Plain files directly inside *dir* are never touched.  With both *day*
    and *keepfile* set to ``None`` no rule applies and every subfolder is
    removed.

    Args:
        dir: Directory whose subfolders are examined.
        day: Age limit in days passed to ``isKeepByDay`` (may be ``None``).
        keepfile: Keep-file path passed to ``isKeepByKeepfile`` (may be
            ``None``).
    """
    import shutil

    for entry in os.listdir(dir):
        subdir = os.path.join(dir, entry)
        if not os.path.isdir(subdir):
            continue
        print("----------------------")
        # The keep file is only consulted when the age rule does not
        # already protect the folder.
        if isKeepByDay(subdir, day) or isKeepByKeepfile(subdir, keepfile):
            continue
        print("remove subfolder: " + subdir)
        # Second argument is ignore_errors: removal failures are skipped.
        shutil.rmtree(subdir, True)
   
def FolderCleanUp():
    """Entry point: parse the command line and clean the given folder.

    Returns without doing anything when the command line is invalid
    (``getOption`` returned ``None``) or when the target is not an
    existing directory (``preCheckDir`` returned ``None``).
    """
    parsed = getOption()
    if parsed is None:
        # getOption already reported the usage error; unpacking None here
        # would raise TypeError.
        return
    day, keepfile, target = parsed

    target = preCheckDir(target)
    if target is None:
        return
    removeSubFolders(target, day, keepfile)
 
# Run the clean-up only when executed as a script, not when imported.
if __name__ == '__main__':
    FolderCleanUp()

对目录下保留最后的zip文件:

复制代码 代码如下:

def KeepLastNumZips(num, directory=None):
    """Keep only the *num* most recent ``.zip`` files of a directory.

    Files are ordered by ``os.path.getctime`` (newest first) and everything
    after the first *num* entries is deleted.  Only names whose extension
    is exactly ``.zip`` are considered; other files are left alone.

    Args:
        num: Number of zip files to keep; must not be negative.
        directory: Directory to clean.  When omitted, the module-level
            name ``zipdir`` is used, as in the original snippet.

    Raises:
        ValueError: if *num* is negative.
    """
    if num < 0:
        raise ValueError("num must not be negative")

    target = zipdir if directory is None else directory
    zipfiles = [os.path.join(target, name)
                for name in os.listdir(target)
                if os.path.splitext(name)[1] == ".zip"]
    if len(zipfiles) > num:
        zipfiles.sort(key=os.path.getctime, reverse=True)
        for stale in zipfiles[num:]:
            os.remove(stale)

使用: 复制代码
代码如下: foldercleanup.py -d 10 -k c:\test\keepfile.txt c:\test
表示对c:\test目录只保留近期10天的…

def dbfreader(f, encoding='latin-1'):
    """Return an iterator over the records of an Xbase DBF file.

    The first row returned contains the field names.
    The second row contains field specs: (type, size, decimal places).
    Subsequent rows contain the data records.
    If a record is marked as deleted, it is skipped.

    Field values are converted by type: ``N`` to ``int`` or
    ``decimal.Decimal`` (blank becomes 0), ``D`` to ``datetime.date``
    (blank becomes ``None``), ``L`` to ``'T'``, ``'F'`` or ``'?'``, ``F``
    to ``float`` (blank becomes 0.0); every other type is returned as
    text, padding included.

    Args:
        f: File-like object opened for binary reads.
        encoding: Codec used to turn field names and values into text.
            The default, latin-1, maps every byte to one character, so
            decoding cannot fail.

    Raises:
        ValueError: if the field descriptor terminator is missing.
        struct.error: if the file is truncated.
    """
    # See DBF format spec at:
    #     http://www.pgts.com.au/download/public/xbase.htm#DBF_STRUCT

    numrec, lenheader = struct.unpack('<xxxxLH22x', f.read(32))
    numfields = (lenheader - 33) // 32

    fields = []
    for _ in range(numfields):
        name, typ, size, deci = struct.unpack('<11sc4xBB14x', f.read(32))
        name = name.replace(b'\0', b'').decode(encoding)  # drop NUL padding
        fields.append((name, typ.decode(encoding), size, deci))
    yield [field[0] for field in fields]
    yield [tuple(field[1:]) for field in fields]

    terminator = f.read(1)
    if terminator != b'\r':
        # An explicit check: 'assert' disappears under python -O.
        raise ValueError('DBF field descriptor terminator not found')

    # Every record starts with a one-byte deletion flag.
    fmt = '1s' + ''.join('%ds' % fieldinfo[2] for fieldinfo in fields)
    fmtsiz = struct.calcsize(fmt)
    for _ in range(numrec):
        record = struct.unpack(fmt, f.read(fmtsiz))
        if record[0] != b' ':
            continue                        # deleted record
        result = []
        for (name, typ, size, deci), raw in zip(fields, record[1:]):
            text = raw.decode(encoding)
            if typ == 'N':
                text = text.replace('\0', '').strip()
                if text == '':
                    value = 0
                elif deci:
                    value = decimal.Decimal(text)
                else:
                    value = int(text)
            elif typ == 'D':
                if text.strip():
                    value = datetime.date(
                        int(text[:4]), int(text[4:6]), int(text[6:8]))
                else:
                    value = None            # blank date field
            elif typ == 'L':
                if text in ('Y', 'y', 'T', 't'):
                    value = 'T'
                elif text in ('N', 'n', 'F', 'f'):
                    value = 'F'
                else:
                    value = '?'
            elif typ == 'F':
                text = text.strip()
                value = float(text) if text else 0.0
            else:
                value = text
            result.append(value)
        yield result


# NOTE: the scraped page interleaved text from two unrelated articles with
# this function; it is kept here in translation rather than dropped:
#   "Superset is a powerful tool for big-data visualization. It integrates
#   deeply with Druid and, combined with Kylin and Presto, provides strong
#   big-data visualization features. It was formerly named Panoramix and
#   then Caravel; compared with Caravel its standout feature is SQL Lab.
#   See the official documentation for details."
#   "Usage:"

 

foldercleanup.py -d 10 -k c:\test\keepfile.txt c:\test

def dbfwriter(f, fieldnames, fieldspecs, records, encoding='latin-1'):
    """Write *records* to *f* as a binary DBF file.

    File f should be open for writing in a binary mode.

    Fieldnames should be no longer than ten characters and not include \\x00.
    Fieldspecs are in the form (type, size, deci) where
        type is one of:
            C for ascii character data
            M for ascii character memo data (real memo fields not supported)
            D for datetime objects
            N for ints or decimal objects
            L for logical values 'T', 'F', or '?'
        size is the field width
        deci is the number of decimal places in the provided decimal object
    Records can be an iterable over the records (sequences of field values).

    Args:
        f: Binary file-like object to write to.
        fieldnames: Sequence of field names.
        fieldspecs: Sequence of (type, size, deci) tuples.
        records: Iterable of records; it is consumed completely before
            anything is written because the header stores the record count.
        encoding: Codec used to encode names and values.

    Raises:
        ValueError: if an encoded value does not have exactly the width of
            its field (for example a number wider than the field).
    """
    # The header needs the record count, so any iterable is materialised.
    records = list(records)

    # header info
    ver = 3
    now = datetime.datetime.now()
    yr, mon, day = now.year - 1900, now.month, now.day
    numrec = len(records)
    numfields = len(fieldspecs)
    lenheader = numfields * 32 + 33
    lenrecord = sum(field[1] for field in fieldspecs) + 1
    hdr = struct.pack('<BBBBLHH20x', ver, yr, mon, day,
                      numrec, lenheader, lenrecord)
    f.write(hdr)

    # field specs
    for name, (typ, size, deci) in zip(fieldnames, fieldspecs):
        raw_name = name.encode(encoding).ljust(11, b'\x00')
        fld = struct.pack('<11sc4xBB14x',
                          raw_name, typ.encode(encoding), size, deci)
        f.write(fld)

    # terminator
    f.write(b'\r')

    # records
    for record in records:
        f.write(b' ')                       # deletion flag
        for (typ, size, deci), value in zip(fieldspecs, record):
            if typ == 'N':
                text = str(value).rjust(size, ' ')
            elif typ == 'D':
                # Explicit formatting: strftime does not zero-pad early
                # years consistently across platforms.
                text = '%04d%02d%02d' % (value.year, value.month, value.day)
            elif typ == 'L':
                text = str(value)[0].upper()
            else:
                text = str(value)[:size].ljust(size, ' ')
            data = text.encode(encoding)
            if len(data) != size:
                raise ValueError(
                    'value %r does not fit field of width %d' % (value, size))
            f.write(data)

    # End of file
    f.write(b'\x1A')


# NOTE: the scraped page interleaved text from other articles with this
# function; it is kept here in translation rather than dropped:
#   "2. Installation"
#   "This keeps, under c:\test, only the subfolders of the last 10 days and
#   the subfolders listed in keepfile.txt."

使用docker进行安装,首先要安装docker。这里采用的是centos7,直接使用yum安装即可。docker安装完成后还需要安装docker-compose。

 


#yum -y install docker docker-compose

代码:  

=======================================================

#yum -y install git

import os
import os.path
import datetime
  
def getOption():
    """Parse the command line of the folder clean-up tool.

    The arguments are read from ``sys.argv`` through ``optparse``.  Exactly
    one positional argument (the directory to clean) is expected.

    Returns:
        A ``(days, keepfile, directory)`` tuple.  ``days`` and ``keepfile``
        are the raw option strings, or ``None`` when the option was not
        given.  ``None`` is returned (after printing an error) when the
        number of positional arguments is not exactly one.
    """
    from optparse import OptionParser

    parser = OptionParser(
        description="clean up the folder with some options",
        prog="foldercleanup",
        version="%prog 0.0.1",
        usage="%prog [options] foldername",
        add_help_option=True,
    )
    parser.add_option(
        '-d', '--days', action='store', type='string', dest='days',
        help="keep the subfolders which are created in recent %days% days")
    parser.add_option(
        '-k', '--keepfile', action='store', type='string', dest='keepfile',
        help="keep the subfolders which are recorded in text file %keepfile% ")
    options, arguments = parser.parse_args()

    if len(arguments) != 1:
        print("error: must input one directory as only one parameter ")
        return None

    return options.days, options.keepfile, arguments[0]

Example calls

# Demo: read a DBF file, drop one record and one field, write the result
# to an in-memory DBF, read it back and finally dump it as CSV.
if __name__ == '__main__':
    import csv
    import sys
    from io import BytesIO
    from io import StringIO
    from operator import itemgetter

    # Read a database
    filename = '/pydev/databases/orders.dbf'
    if len(sys.argv) == 2:
        filename = sys.argv[1]
    with open(filename, 'rb') as f:
        db = list(dbfreader(f))
    for record in db:
        print(record)
    fieldnames, fieldspecs, records = db[0], db[1], db[2:]

    # Alter the database
    del records[4]
    records.sort(key=itemgetter(4))

    # Remove a field
    del fieldnames[0]
    del fieldspecs[0]
    records = [rec[1:] for rec in records]

    # Create a new DBF (binary data, hence BytesIO)
    f = BytesIO()
    dbfwriter(f, fieldnames, fieldspecs, records)

    # Read the data back from the new DBF
    print('-' * 20)
    f.seek(0)
    for line in dbfreader(f):
        print(line)
    f.close()

    # Convert to CSV (text data, hence StringIO)
    print('.' * 20)
    f = StringIO()
    writer = csv.writer(f)
    writer.writerow(fieldnames)
    writer.writerows(records)
    print(f.getvalue())
    f.close()

找到最新的superset的docker:
clone到服务器上。

 
def preCheckDir(dir):
    """Validate that *dir* names an existing directory.

    Args:
        dir: Path given on the command line.

    Returns:
        The absolute form of *dir*, or ``None`` (after printing an error)
        when the path does not exist or is not a directory.
    """
    if not os.path.exists(dir):
        print("error: the directory your input is not existed")
        return None
    if not os.path.isdir(dir):
        print("error: the parameter your input is not a directory")
        return None

    return os.path.abspath(dir)
  
def isKeepByDay(dir, day):
    """Tell whether *dir* is recent enough to be kept.

    Args:
        dir: Path of the folder to inspect.
        day: Number of days (int or numeric string) a folder stays
            protected, or ``None`` to disable the age rule.

    Returns:
        ``True`` when ``os.path.getctime(dir)`` falls on or after the date
        *day* days before today; ``False`` otherwise, and always ``False``
        when *day* is ``None``.

    Raises:
        ValueError: if *day* cannot be converted with ``int()``.
    """
    indays = False
    if day is not None:
        # NOTE: getctime is the creation time on Windows but the last
        # metadata change on Unix, so "recent" is platform dependent.
        createdate = datetime.date.fromtimestamp(os.path.getctime(dir))
        indate = datetime.date.today() - datetime.timedelta(days=int(day))
        print(createdate)
        indays = createdate >= indate
    print(indays)
    return indays
  
def isKeepByKeepfile(dir, keepfile):
    """Tell whether *dir* is listed in the keep file.

    Each non-blank line of *keepfile* is a folder name.  *dir* matches when
    its path ends with a path separator followed by that name, compared
    case-insensitively.  A backslash is always accepted as separator, in
    addition to the separators of the current platform.

    Args:
        dir: Path of the folder to inspect.
        keepfile: Path of the text file listing protected folder names, or
            ``None`` to disable the rule.

    Returns:
        ``True`` when a line of the keep file matches *dir*; ``False``
        otherwise, including when the keep file cannot be opened or read
        (an error is printed in that case).
    """
    needkeep = False
    print(dir)
    if keepfile is not None:
        separators = set(["\\", os.sep])
        if os.altsep:
            separators.add(os.altsep)
        target = dir.upper()
        try:
            # 'with' closes the file even when reading fails part-way.
            with open(keepfile, "r") as kf:
                for line in kf:
                    print(line)
                    name = line.strip().upper()
                    if not name:
                        # A blank line would otherwise match every path
                        # that merely ends with a separator.
                        continue
                    suffixes = tuple(sep + name for sep in separators)
                    if target.endswith(suffixes):
                        needkeep = True
        except (IOError, OSError):
            # NOTE(review): the caller treats this as "nothing listed", so
            # an unreadable keep file protects no folder.
            print("error: keep file cannot be opened")
    print(needkeep)
    return needkeep
    
def removeSubFolders(dir, day, keepfile):
    """Delete the direct subfolders of *dir* that no rule protects.

    A subfolder is kept when ``isKeepByDay`` or ``isKeepByKeepfile``
    returns true for it; every other subfolder is removed recursively.
    Plain files directly inside *dir* are never touched.  With both *day*
    and *keepfile* set to ``None`` no rule applies and every subfolder is
    removed.

    Args:
        dir: Directory whose subfolders are examined.
        day: Age limit in days passed to ``isKeepByDay`` (may be ``None``).
        keepfile: Keep-file path passed to ``isKeepByKeepfile`` (may be
            ``None``).
    """
    import shutil

    for entry in os.listdir(dir):
        subdir = os.path.join(dir, entry)
        if not os.path.isdir(subdir):
            continue
        print("----------------------")
        # The keep file is only consulted when the age rule does not
        # already protect the folder.
        if isKeepByDay(subdir, day) or isKeepByKeepfile(subdir, keepfile):
            continue
        print("remove subfolder: " + subdir)
        # Second argument is ignore_errors: removal failures are skipped.
        shutil.rmtree(subdir, True)
    
def FolderCleanUp():
    """Entry point: parse the command line and clean the given folder.

    Returns without doing anything when the command line is invalid
    (``getOption`` returned ``None``) or when the target is not an
    existing directory (``preCheckDir`` returned ``None``).
    """
    parsed = getOption()
    if parsed is None:
        # getOption already reported the usage error; unpacking None here
        # would raise TypeError.
        return
    day, keepfile, target = parsed

    target = preCheckDir(target)
    if target is None:
        return
    removeSubFolders(target, day, keepfile)
  
# Run the clean-up only when executed as a script, not when imported.
if __name__ == '__main__':
    FolderCleanUp()

Example Output

"""
['ORDER_ID', 'CUSTMR_ID', 'EMPLOY_ID', 'ORDER_DATE', 'ORDER_AMT']
[('C', 10, 0), ('C', 11, 0), ('C', 11, 0), ('D', 8, 0), ('N', 12, 2)]
['10005 ', 'WALNG ', '555 ', datetime.date(1995, 5, 22), Decimal("173.40")]
['10004 ', 'BMARK ', '777 ', datetime.date(1995, 5, 18), Decimal("3194.20")]
['10029 ', 'SAWYH ', '777 ', datetime.date(1995, 6, 29), Decimal("97.30")]
['10013 ', 'RITEB ', '777 ', datetime.date(1995, 6, 2), Decimal("560.40")]
['10024 ', 'RATTC ', '444 ', datetime.date(1995, 6, 21), Decimal("2223.50")]
['10018 ', 'RATTC ', '444 ', datetime.date(1995, 6, 12), Decimal("1076.05")]
['10025 ', 'RATTC ', '444 ', datetime.date(1995, 6, 23), Decimal("185.80")]
['10038 ', 'OLDWO ', '111 ', datetime.date(1995, 7, 14), Decimal("863.96")]
['10002 ', 'MTIME ', '333 ', datetime.date(1995, 5, 16), Decimal("731.80")]
['10007 ', 'MORNS ', '444 ', datetime.date(1995, 5, 24), Decimal("1405.00")]
['10026 ', 'MORNS ', '555 ', datetime.date(1995, 6, 26), Decimal("17.40")]
['10030 ', 'LILLO ', '111 ', datetime.date(1995, 7, 3), Decimal("909.91")]
['10022 ', 'LAPLA ', '111 ', datetime.date(1995, 6, 19), Decimal("671.50")]
['10035 ', 'HIGHG ', '111 ', datetime.date(1995, 7, 11), Decimal("1984.83")]
['10033 ', 'FOODG ', '333 ', datetime.date(1995, 7, 6), Decimal("3401.32")]
--------------------
['CUSTMR_ID', 'EMPLOY_ID', 'ORDER_DATE', 'ORDER_AMT']
[('C', 11, 0), ('C', 11, 0), ('D', 8, 0), ('N', 12, 2)]
['MORNS ', '555 ', datetime.date(1995, 6, 26), Decimal("17.40")]
['SAWYH ', '777 ', datetime.date(1995, 6, 29), Decimal("97.30")]
['WALNG ', '555 ', datetime.date(1995, 5, 22), Decimal("173.40")]
['RATTC ', '444 ', datetime.date(1995, 6, 23), Decimal("185.80")]
['RITEB ', '777 ', datetime.date(1995, 6, 2), Decimal("560.40")]
['LAPLA ', '111 ', datetime.date(1995, 6, 19), Decimal("671.50")]
['MTIME ', '333 ', datetime.date(1995, 5, 16), Decimal("731.80")]
['OLDWO ', '111 ', datetime.date(1995, 7, 14), Decimal("863.96")]
['LILLO ', '111 ', datetime.date(1995, 7, 3), Decimal("909.91")]
['RATTC ', '444 ', datetime.date(1995, 6, 12), Decimal("1076.05")]
['MORNS ', '444 ', datetime.date(1995, 5, 24), Decimal("1405.00")]
['HIGHG ', '111 ', datetime.date(1995, 7, 11), Decimal("1984.83")]
['BMARK ', '777 ', datetime.date(1995, 5, 18), Decimal("3194.20")]
['FOODG ', '333 ', datetime.date(1995, 7, 6), Decimal("3401.32")]
....................
CUSTMR_ID,EMPLOY_ID,ORDER_DATE,ORDER_AMT
MORNS ,555 ,1995-06-26,17.40
SAWYH ,777 ,1995-06-29,97.30
WALNG ,555 ,1995-05-22,173.40
RATTC ,444 ,1995-06-23,185.80
RITEB ,777 ,1995-06-02,560.40
LAPLA ,111 ,1995-06-19,671.50
MTIME ,333 ,1995-05-16,731.80
OLDWO ,111 ,1995-07-14,863.96
LILLO ,111 ,1995-07-03,909.91
RATTC ,444 ,1995-06-12,1076.05
MORNS ,444 ,1995-05-24,1405.00
HIGHG ,111 ,1995-07-11,1984.83
BMARK ,777 ,1995-05-18,3194.20
FOODG ,333 ,1995-07-06,3401.32
"""
```

#cd /data

 

#git clone 

对目录下保留最后的zip文件:

修改docker-compose.yml文件

def KeepLastNumZips(num, directory=None):
    """Keep only the *num* most recent ``.zip`` files of a directory.

    Files are ordered by ``os.path.getctime`` (newest first) and everything
    after the first *num* entries is deleted.  Only names whose extension
    is exactly ``.zip`` are considered; other files are left alone.

    Args:
        num: Number of zip files to keep; must not be negative.
        directory: Directory to clean.  When omitted, the module-level
            name ``zipdir`` is used, as in the original snippet.

    Raises:
        ValueError: if *num* is negative.
    """
    if num < 0:
        raise ValueError("num must not be negative")

    target = zipdir if directory is None else directory
    zipfiles = [os.path.join(target, name)
                for name in os.listdir(target)
                if os.path.splitext(name)[1] == ".zip"]
    if len(zipfiles) > num:
        zipfiles.sort(key=os.path.getctime, reverse=True)
        for stale in zipfiles[num:]:
            os.remove(stale)


# Shell command from the surrounding article (it introduces the YAML below):
#cat docker-compose.yml

# docker-compose definition for a single Superset container.
version: '2'
services:
  superset:
    build:
      context: .
      args:
        SUPERSET_VERSION: 0.20.6
    image: amancevice/superset
    container_name: superset
    volumes:
      # Host-name mapping made visible inside the container.
      - /data/superset/hosts:/etc/hosts
      # Superset configuration file.
      - /data/superset/conf/superset_config.py:/etc/superset/superset_config.py
      # Locally modified utils.py mounted over the packaged one.
      - /data/superset/conf/utils.py:/usr/local/lib/python3.5/dist-packages/superset/utils.py
    ports:
      # Quoted so YAML never reinterprets the host:container pair.
      - "8088:8088"

 

注:必须要做hosts映射,因为要使用hive或presto,填写地址时必须使用主机名,不能使用ip地址,原因是pyhive 0.5不支持ip。utils.py很重要,主要是为了去除sql lab的timeout。

 

金沙澳门官网7817网址 1

 

将上述的signal注释掉,新加一个pass。这种方法把控制超时、发送信号的代码注释掉了,这样查询超过30s的时候就不会把进程kill掉。

完!

配置文件:

#cat superset_config.py

# ---------------------------------------------------------
# Superset specific config
# ---------------------------------------------------------
ROW_LIMIT = 5000
SUPERSET_WORKERS = 4
SUPERSET_WEBSERVER_TIMEOUT = 3000
SUPERSET_WEBSERVER_PORT = 8088
# ---------------------------------------------------------

# ---------------------------------------------------------
# Flask App Builder configuration
# ---------------------------------------------------------
# Your App secret key
# NOTE(review): a hard-coded key published in an article must be replaced
# by a private random value before any real deployment.
# '\2' and '\1' are octal escapes; the trailing backslashes are doubled so
# the value stays the same without relying on invalid escape sequences.
SECRET_KEY = '\2\1thisismyscretkey\1\2\\e\\y\\y\\h'

# The SQLAlchemy connection string to your database backend
# This connection defines the path to the database that stores your
# superset metadata (slices, connections, tables, dashboards, ...).
# Note that the connection information to connect to the datasources
# you want to explore are managed directly in the web UI
#SQLALCHEMY_DATABASE_URI = 'sqlite:////data/superset.db'
SQLALCHEMY_DATABASE_URI = 'sqlite:////home/superset/superset.db'

# Flask-WTF flag for CSRF
WTF_CSRF_ENABLED = True
# Add endpoints that need to be exempt from CSRF protection
WTF_CSRF_EXEMPT_LIST = []

# Set this API key to enable Mapbox visualizations
MAPBOX_API_KEY = ''

 

#cat utils.py

"""Utility functions used across Superset"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import decimal
import functools
import json
import logging
import os
import signal
import parsedatetime
import smtplib
import pytz
import sqlalchemy as sa
import uuid
import sys
import zlib
import numpy

from builtins import object
from datetime import date, datetime, time, timedelta

import celery
from dateutil.parser import parse
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
from email.utils import formatdate

# Parenthesised so the import list can span several lines.
from flask import (
    flash,
    Markup,
    render_template,
    url_for,
    redirect,
    request
)
from flask_appbuilder.const import (
    LOGMSG_ERR_SEC_ACCESS_DENIED,
    FLAMSG_ERR_SEC_ACCESS_DENIED,
    PERMISSION_PREFIX
)
from flask_appbuilder._compat import as_unicode
from flask_babel import gettext as __
from flask_cache import Cache
import markdown as md
from past.builtins import basestring
from pydruid.utils.having import Having
from sqlalchemy import event, exc, select
from sqlalchemy.types import TypeDecorator, TEXT

# Set the level of the 'MARKDOWN' logger to INFO.
logging.getLogger('MARKDOWN').setLevel(logging.INFO)

# True when running under Python 3 or later.
PY3K = sys.version_info >= (3, 0)
# Naive datetime for 1970-01-01 00:00:00.
EPOCH = datetime(1970, 1, 1)
DTTM_ALIAS = '__timestamp'

class SupersetException(Exception):
    """Exception type defined by this module; adds nothing to Exception."""

class SupersetTimeoutException(SupersetException):
    """SupersetException subclass with no behaviour of its own."""