#!/usr/bin/env cctools_python
# CCTOOLS_PYTHON_VERSION 2.7 2.6

# Copyright (C) 2014- The University of Notre Dame
# This software is distributed under the GNU General Public License.
# See the file COPYING for details.
#
# This program generates makeflows to parallelize the
# popular blasr program.


import optparse, subprocess, os, sys, re, string, logging as log, stat


options = None
args = None

class PassThroughParser(optparse.OptionParser):
    """OptionParser that forwards options it does not recognise.

    Unknown or ambiguous options are not treated as errors.  They are
    collected, in command-line order, together with the positional
    arguments so the caller can hand them on to another program unchanged.
    """

    def _process_args(self, largs, rargs, values):
        """Parse ``rargs``, diverting unknown/ambiguous options into ``largs``.

        The base implementation aborts on the first bad option, so it is
        re-entered until every remaining argument has been consumed.
        """
        while rargs:
            try:
                optparse.OptionParser._process_args(self, largs, rargs, values)
            except (optparse.BadOptionError, optparse.AmbiguousOptionError) as e:
                largs.append(e.opt_str)

    def _process_short_opts(self, rargs, values):
        """Process a single-dash token, reporting the whole token when unknown.

        optparse treats ``-nproc`` as the cluster ``-n -p -r -o -c`` and
        reports only the first unknown letter (``-n``).  Re-raising with the
        complete token keeps single-dash multi-letter options intact when
        they are passed through.
        """
        token = rargs[0]
        try:
            optparse.OptionParser._process_short_opts(self, rargs, values)
        except optparse.BadOptionError:
            raise optparse.BadOptionError(token)

def parse_blasr_options(parser):
    """Attach the "Blasr Options" group (--ref, --query, --output) to parser."""
    blasr_group = optparse.OptionGroup(parser, "Blasr Options")

    # (flag, destination attribute, help text) -- registered in this order
    option_specs = (
        ('--ref', "ref", 'The reference input file'),
        ('--query', "query", 'The query input file'),
        ('--output', "output", 'the final output of blasr alignment'),
    )
    for flag, dest_name, help_text in option_specs:
        blasr_group.add_option(flag, dest=dest_name, type="string", help=help_text)

    parser.add_option_group(blasr_group)

def parse_makeflow_options(parser):
    """Attach the "Makeflow Preparation Options" group to parser.

    Registers --makeflow, --num_seq (int, default 50000), --makeflow-config
    (stored as ``config``) and the --verbose flag.
    """
    group = optparse.OptionGroup(parser, "Makeflow Preparation Options")
    group.add_option('--makeflow', dest='makeflow', type='string', help='Makeflow destination file [stdout]')
    group.add_option('--num_seq', dest='num_seq', help='determine the number of sequences per split', default=50000, type='int')
    group.add_option('--makeflow-config', dest='config', help='Makeflow configurations')
    # The default must be the boolean False: the string "False" is truthy,
    # so any plain truth test on options.verbose would always succeed.
    group.add_option('--verbose', dest='verbose', help='Show verbose level output', action='store_true', default=False)
    parser.add_option_group(group)

#Helper function for finding executables in path
def search_file(filename, search_path, pathsep=os.pathsep):
    """Return the absolute path of the first match for filename, or None.

    search_path is a pathsep-separated list of directories, examined in
    order.  Any existing filesystem entry counts as a match (regular file
    or directory).  An empty entry in search_path makes the candidate the
    bare filename, i.e. relative to the current directory.
    """
    # str.split replaces string.split, which no longer exists in Python 3.
    for directory in search_path.split(pathsep):
        candidate = os.path.join(directory, filename)
        if os.path.exists(candidate):
            return os.path.abspath(candidate)
    return None


def count_splits(num_seq, query):
    """Return how many partition files splitting the FASTQ file yields.

    Applies the same counting rule as the Perl split script emitted by
    write_split_reduce, so the number returned here matches the number of
    "<n>.<query>" files that script creates.

    num_seq -- read count at which a new partition is started
    query   -- path of the FASTQ file to scan

    NOTE: the read counter is reset to 0 (not 1) on the record that opens a
    new partition, so every partition after the first holds num_seq + 1
    reads.  This is kept as-is because the Perl script does the same.
    """
    log.info("Counting Number of Partitions")
    num_splits = 1  # partition 0 always exists, even for an empty file
    read_count = 0

    # 'with' guarantees the handle is released even if reading fails.
    with open(query, "r") as fastq:
        for line_count, line in enumerate(fastq):
            # Only every fourth line can be a record header; quality lines
            # may also begin with '@' and must not be counted.
            if line_count % 4 != 0 or not line.startswith('@'):
                continue
            if read_count == num_seq:
                num_splits += 1
                read_count = 0
                if num_splits % 10 == 0:
                    log.info("Current Number of Partitions : {0}".format(num_splits))
            else:
                read_count += 1

    log.info("Total Number of Partitions : {0}".format(num_splits))
    return num_splits


def write_split_reduce(destination):
    """Write the Perl FASTQ-splitting script to the file named destination.

    The generated script takes the FASTQ file name and the number of reads
    per partition as its two arguments, writes the partitions to
    "<n>.<file>" and prints the number of partitions it created.
    """
    with open(destination, 'w') as split_script:
        # The backslash after the opening quotes suppresses the leading
        # newline so that the '#!' line really is the first line of the file.
        split_script.write('''\
#!/usr/bin/perl
#
#Copyright (C) 2013- The University of Notre Dame
#This software is distributed under the GNU General Public License.
#See the file COPYING for details.
#
#Programmer: Brian Kachmarck
#Date: 7/28/2009
#
#Revised: Nick Hazekamp
#Date: 12/02/2013
#
#Purpose: Split a FASTQ file into smaller files determined by the number of sequences input

use strict;

my $numargs = $#ARGV + 1;
my $file =$ARGV[0];
my $num_reads=$ARGV[1];
my $num_splits = 0;
my $line_count= 0;


#Open input file
open(INPUT, $file);

my $read_count = 0;
open (OUTPUT,\">$num_splits.$file\");
$num_splits++;
while (my $line = <INPUT>) {
	chomp $line;
	#FASTQ files begin sequence with '@' character
	#If line begins with '@' then it is a new sequence and has 3 lines in between
	if ($line =~ /^[@]/ and $line_count % 4 ==0){
		#Check if the new sequence should be placed in a new file, otherwise place it in same file
		if ($read_count == $num_reads){
			close(OUTPUT);
			open(OUTPUT, \">$num_splits.$file\");
			print OUTPUT $line;
			print OUTPUT \"\\n\";
			$num_splits++;
			$read_count = 0;
		}
		else{
			print OUTPUT $line;
			print OUTPUT \"\\n\";
			$read_count++;
		}
	}
	#place all other lines in FASTQ file under same sequence
	else {
		print OUTPUT $line;
		print OUTPUT \"\\n\";
	}

	$line_count++;
}
print $num_splits;
close(INPUT);
''')



def write_makeflow(destination, configuration, num_reads_per_split, split_name):
    """Write the Makeflow rules that split, align and join the query.

    destination         -- output path; a false value writes to stdout
    configuration       -- optional file whose contents are copied to the
                           top of the Makeflow; a false value skips this
    num_reads_per_split -- partition size passed to the split script and
                           used to work out how many partitions to expect
    split_name          -- file name of the Perl split script

    The query, reference and final output names are read from the
    module-level ``options``; the pass-through blasr arguments from ``args``.
    """
    query = options.query

    if destination:
        makeflow = open(destination, 'w')
        log.info("Writing Makeflow to : {0}".format(destination))
    else:
        makeflow = sys.stdout
        log.info("Writing Makeflow to : STDOUT")
    try:
        if configuration:
            log.info("Reading Configuration from : {0}".format(configuration))
            with open(configuration, 'r') as config:
                config_text = config.read()
            makeflow.write(config_text)
            # Keep the export line below on a line of its own even when the
            # configuration file lacks a trailing newline.
            if config_text and not config_text.endswith("\n"):
                makeflow.write("\n")

        # Count with the same partition size that is handed to the split
        # script, so the rules name exactly the files the script creates.
        num_splits = count_splits(num_reads_per_split, query)

        split_files = [str(i) + "." + query for i in range(num_splits)]
        output_files = ["output." + name for name in split_files]
        # Each name is followed by a single space, trailing one included.
        inputlist = "".join(name + " " for name in split_files)
        outputlist = "".join(name + " " for name in output_files)

        #Here we actually start generating the Makeflow
        makeflow.write("export LD_LIBRARY_PATH=hdf5")
        log.info("Writing Partitioning Rules")
        #How to get inputs
        makeflow.write("\n{0} : {1} {2}".format(inputlist, query, split_name))
        makeflow.write("\n\tLOCAL perl {0} {1} {2}".format(split_name, query, num_reads_per_split))

        #How to get outputs
        for split_file, output_file in zip(split_files, output_files):
            makeflow.write("\n" + output_file + ": " + split_file + " " + options.ref + " blasr ./hdf5/ \n")
            makeflow.write("\t LD_LIBRARY_PATH=\"./hdf5/\" ./blasr "
                    + split_file
                    + " " + options.ref
                    + " " + ' '.join(args)
                    + " > " + output_file)

        #How to concatenate outputs
        log.info("Writing Join Rules")
        makeflow.write("\n{0} : {1}".format(options.output, outputlist))
        makeflow.write("\n\tLOCAL cat {0} > {1}\n".format(outputlist, options.output))

    finally:
        # Close only a file opened here; stdout must stay usable.
        if makeflow is not sys.stdout:
            makeflow.close()


    #Parse Command Line
parser = PassThroughParser()
parse_makeflow_options(parser)
parse_blasr_options(parser)
# options: recognised settings; args: everything else, forwarded to blasr
(options, args) = parser.parse_args()

# Identity check on purpose: only the store_true action yields a real True,
# so any other default value counts as "not verbose".
if options.verbose is True:
    log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
    log.info("Verbose output.")
else:
    log.basicConfig(format="%(levelname)s: %(message)s")

if not options.ref:
    log.error("Reference fasta required : --ref")
    sys.exit(4)
else:
    log.info("Reference Fasta : {0}".format(options.ref))

if not options.query:
    log.error("Query fastq required : --query")
    sys.exit(4)
else:
    log.info("Query Fastq : {0}".format(options.query))

# Fall back to 50000 reads per partition when --num_seq is missing or 0.
num_reads_per_split = 50000

if options.num_seq:
    num_reads_per_split = int(options.num_seq)
log.info("Partition Size : {0}".format(num_reads_per_split))

# (Re)create the Perl split script in the current directory.
split_name = "query_reduce"
if os.path.exists(split_name):
    os.remove(split_name)

write_split_reduce(split_name)
st = os.stat(split_name)

os.chmod(split_name, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
log.info("{0} made executable".format(split_name))

# Search PATH followed by the current directory; PATH may be unset.
path = os.getenv("PATH") or ""
if path:
    path += os.pathsep + "."
else:
    path = "."

log.info("Searching PATH for Blasr")

# The generated rules run ./blasr, so link it here unless already present.
blasr = search_file("blasr", path)
if os.path.exists("./blasr"):
    log.info("Blasr found locally")
elif blasr:
    log.info("Blasr linked from : " + blasr)
    os.symlink(blasr, "blasr")
else:
    log.error("Unable to find blasr")
    sys.exit(3)

log.info("Searching PATH for HDF5 library")

# The generated rules reference ./hdf5/, so link it unless already present.
hdf5 = search_file("hdf5", path)
if os.path.exists("./hdf5"):
    log.info("HDF5 library found locally")
elif hdf5:
    log.info("HDF5 library linked from : " + hdf5)
    # Link the location that was actually found (was the undefined hdf5_lib).
    os.symlink(hdf5, "hdf5")
else:
    log.error("Unable to find HDF5 library")
    sys.exit(3)


write_makeflow(options.makeflow, options.config, num_reads_per_split, split_name)
