#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Create BibTeX entry for IETF documents (RFCs and Internet Drafts)
# Copyright (C) 2014-2021 by Thomas Dreibholz
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Contact: dreibh@iem.uni-due.de
#

from xml.dom.minidom import parse
import re
import os
import sys
import csv
import codecs
import urllib.request


# ------ Runtime configuration ----------------------------------------------
# URL prefixes of the BibXML metadata files; the document name (I-Ds) or the
# zero-padded number (RFCs) plus '.xml' is appended to build the full URL.
draftPrefix = 'https://xml2rfc.tools.ietf.org/public/rfc/bibxml-ids-new/reference.I-D.'
rfcPrefix = 'https://xml2rfc.tools.ietf.org/public/rfc/bibxml-rfcs/reference.RFC.'

# Print progress and "Fix: ..." notes to stderr (see --quiet/--verbose).
verboseMode = True

# 'DEFAULT': search the standard locations; None: do not use a fix file;
# any other string: path of the authors fix file to use.
authorsFixFile = 'DEFAULT'


# ------ Documents that are skipped because their XML metadata is wrong -----
blacklistTable = [
   'draft-ahmed-lssctp',
   'draft-ietf-rserpool-api', 'draft-ietf-rserpool-applic',
   'draft-ietf-rserpool-arch', 'draft-ietf-rserpool-comp',
   'draft-ietf-rserpool-mib', 'draft-ietf-rserpool-tcpmapping',
   'draft-iyengar-sctp-cacc',
   'RFC3246',
]


# ------ Month tables --------------------------------------------------------
# Full English month names; month number n is stored at index n-1. The final
# 'INVALID' entry (number 13) is what a failed month lookup ends up at.
monthNameTable = [
   'January',
   'February',
   'March',
   'April',
   'May',
   'June',
   'July',
   'August',
   'September',
   'October',
   'November',
   'December',
   'INVALID',
]

# BibTeX month macros ('jan' ... 'dec'): the first three letters of each of
# the twelve real month names, in lower case.
monthIDTable = [ monthName[0:3].lower() for monthName in monthNameTable[0:12] ]


# ====== Get BibTeX from IETF document ======================================
def getBibTeXFromIETFDocument(document):
   """Write a BibTeX @TechReport entry for an IETF document to stdout.

   document is the name of an Internet Draft ('draft-...') or of an RFC
   ('RFC' followed by its number; the 'RFC' prefix is matched
   case-insensitively and the name is normalised to 'RFC<number>').

   The function
   1. downloads the BibXML metadata file (draftPrefix/rfcPrefix) and reads
      authors, date, title, series information, DOI, target URL and abstract,
   2. downloads the document text from the target URL and uses it to correct
      the date, to obtain the RFC category and to replace abbreviated author
      names by the longest matching name found in the authors section,
   3. applies the replacements from the authors fix file (authorsFixFile),
   4. prints the resulting entry.

   Blacklisted documents (blacklistTable) are skipped with a NOTE on stderr.
   Progress and "Fix: ..." messages go to stderr when verboseMode is set.

   Returns None. Calls sys.exit(1) when the document type is unknown, when a
   download fails, when the XML contains no authors or when a requested
   authors fix file cannot be loaded.
   """

   # ====== Check blacklist (name as given) ====================================
   if document in blacklistTable:
      sys.stderr.write('NOTE: ' + document + ' is blacklisted due to bad metadata in XML file. Skipping.\n')
      return

   issn         = None
   documentType = None
   institution  = None
   language     = 'english'
   note         = None
   rfcNumber    = None

   # ====== Identify document type =============================================
   draftMatch = re.match(r'^draft-(.*)$', document)
   # At least one digit is required: a bare "rfc" is an unknown document.
   rfcMatch   = re.match(r'^[Rr][Ff][Cc]([0-9]+)$', document)
   if draftMatch is not None:
      documentType = 'Internet Draft'
      if document.startswith('draft-ietf-'):
         institution = 'IETF'
      else:
         institution = 'IETF, Individual Submission'
      # note = 'Work in Progress'
      url = draftPrefix + document + '.xml'
   elif rfcMatch is not None:
      rfcNumber    = int(rfcMatch.group(1))
      documentType = 'RFC'
      issn         = '2070-1721'
      institution  = 'IETF'
      document     = documentType + str(rfcNumber)
      # The blacklist stores normalised names, so check again now that
      # e.g. "rfc3246" has become "RFC3246".
      if document in blacklistTable:
         sys.stderr.write('NOTE: ' + document + ' is blacklisted due to bad metadata in XML file. Skipping.\n')
         return
      url = rfcPrefix + '{0:04d}'.format(rfcNumber) + '.xml'
   else:
      sys.stderr.write('ERROR: Unknown document type ' + document + '!\n')
      sys.exit(1)


   # ====== Get document information from XML file =============================
   if verboseMode:
      sys.stderr.write('Fetching ' + url + ' ...\n')
   try:
      with urllib.request.urlopen(url) as xmlFile:
         xmlDocument = parse(xmlFile)
   except Exception as e:
      sys.stderr.write('ERROR: Failed to get XML information from ' + url + ': ' + str(e) + '!\n')
      sys.exit(1)

   # authorRegExpList[i] is the pattern used to find author i in the document
   # text; fullAuthorList[i] is the best name known so far for that author.
   fullAuthorList   = []
   authorRegExpList = []
   authorList = xmlDocument.getElementsByTagName('author')
   if len(authorList) == 0:
      sys.stderr.write('ERROR: It seems that the XML file ' + url + ' does not exist. Wrong document name?\n')
      sys.exit(1)

   for author in authorList:
      # ------ Try old format first -----------------------------------------
      if ( author.hasAttribute('initials') and author.hasAttribute('surname') and
           author.getAttribute('initials') != '' ):
         initial = author.getAttribute('initials')[0]
         surname = author.getAttribute('surname')
         # The names are literal text, not patterns: escape them.
         authorRegExpList.append(re.escape(initial) + r'[a-zA-Z][a-zA-Z\s\.\- ]*' + re.escape(surname))
         fullAuthorList.append(initial + '. ' + surname)
      # ------ Try new format -----------------------------------------------
      elif author.hasAttribute('fullname'):
         fullName = author.getAttribute('fullname')
         authorRegExpList.append(re.escape(fullName))
         fullAuthorList.append(fullName)
      # Authors with neither format (no usable attributes) are skipped.

   # ------ Date -------------------------------------------------------------
   dateList = xmlDocument.getElementsByTagName('date')

   def dateAttribute(attributeName):
      # Value of an attribute of the first <date> element; '' when missing.
      if len(dateList) == 0:
         return ''
      return dateList[0].getAttribute(attributeName)

   try:
      month = monthNameTable.index(dateAttribute('month')) + 1
   except ValueError:
      # Unknown or missing month: same number as the 'INVALID' entry.
      month = len(monthNameTable)
   try:
      day = int(dateAttribute('day'))
   except ValueError:
      day = 0
   yearValue = dateAttribute('year')
   try:
      year = int(yearValue)
   except ValueError:   # Some RFCs have some garbage before the year content.
      try:
         year = int(yearValue[-4:])
      except ValueError:
         year = 0   # Unknown; may still be corrected from the document text.

   # ------ Title ------------------------------------------------------------
   titleList = xmlDocument.getElementsByTagName('title')
   title = titleList[0].childNodes[0].data
   title = title.strip().replace(' (', '~(').replace(' - ', ' -- ')

   # ------ Series information and DOI ---------------------------------------
   name = None
   doi  = None
   for seriesInfo in xmlDocument.getElementsByTagName('seriesInfo'):
      seriesName = seriesInfo.getAttribute('name')
      if seriesName in ('RFC', 'Internet-Draft'):
         name = seriesInfo.getAttribute('value')
      elif seriesName == 'DOI':
         doi = seriesInfo.getAttribute('value')
   if not name:
      # Without this fallback, writing the "number" field below would fail.
      if documentType == 'RFC':
         name = str(rfcNumber)
      else:
         name = document
      sys.stderr.write('WARNING: No series information found for ' + document + ', using number ' + name + '!\n')

   # ------ URL of the document text -----------------------------------------
   formatList = xmlDocument.getElementsByTagName('format')
   if (len(formatList) > 0) and formatList[0].hasAttribute('target'):
      url = formatList[0].getAttribute('target')
   elif documentType == 'RFC':
      url = 'https://tools.ietf.org/rfc/' + document.lower() + '.txt'
   else:
      url = 'https://tools.ietf.org/id/' + document + '.txt'

   # Use secure URLs instead of HTTP:
   url = url.replace('http://www.rfc-editor.org/rfc/',
                     'https://www.rfc-editor.org/rfc/')
   url = url.replace('http://www.ietf.org/internet-drafts/',
                     'https://tools.ietf.org/id/')
   # Fix redirect URL:
   url = url.replace('https://www.ietf.org/internet-drafts/',
                     'https://tools.ietf.org/id/')

   # ------ Abstract ---------------------------------------------------------
   # The text of the first child node of the first <t> element, if any.
   abstract      = None
   paragraphList = xmlDocument.getElementsByTagName('t')
   if (len(paragraphList) > 0) and (len(paragraphList[0].childNodes) > 0):
      abstract = getattr(paragraphList[0].childNodes[0], 'data', None)

   if abstract is not None:
      replacementList = [ ('\t', ' '),
                          ('\n', ' '),
                          (' (', '~('),
                          (' - ', ' -- '),
                          ('i.e. ', 'i.e.\\ '),   # LaTeX: "\ " = inter-word space
                          ('e.g. ', 'e.g.\\ '),
                          (' [STANDARDS-TRACK]', '') ]
      for oldText, newText in replacementList:
         abstract = abstract.strip().replace(oldText, newText)
      abstract = re.sub(r'(RFC|STD)( )([0-9]+)', r'\1~\3', abstract.strip())
      abstract = abstract.replace('</t><t>', '\n')
      # Collapse runs of spaces into a single space.
      abstract = re.sub(r' {2,}', ' ', abstract.strip())


   # ====== Get full author list from text file ================================
   if verboseMode:
      sys.stderr.write('Fetching ' + url + ' ...\n')
   try:
      txtFile = urllib.request.urlopen(url)
   except Exception as e:
      sys.stderr.write('ERROR: Failed to get document from ' + url + ': ' + str(e) + '!\n')
      sys.exit(1)

   datePattern           = '^.*(January|February|March|April|May|June|July|August|September|October|November|December)( [0-9]+[,]|)[ ]*([0-9][0-9][0-9][0-9])$'
   authorsHeadingPattern = r"(^([0-9\.]*|[A-Z]\.)[ \t]+.*|^)(Editors|Authors|Authors'[ \t]+Addresses|Author's[ \t]+Address|Editor's[ \t]+Address|Author[ \t]+Information).*$"

   inAuthorsSection = False
   foundDate        = False
   with txtFile:
      for lineNumber, rawLine in enumerate(txtFile, 1):
         try:
            line = rawLine.decode().strip()
         except UnicodeDecodeError:
            # Sometimes, there may be some strange letters inside.
            line = ''   # Just ignore not-decodable stuff for now.

         # ====== Document header (first 29 lines) ==========================
         if lineNumber < 30:
            # ------ RFC category -------------------------------------------
            if re.search('^Category:', line):
               m = re.match('^Category: (.*)[ ][ ].*$', line)
               if m is not None:
                  category = m.group(1).strip()
                  if (documentType == 'RFC') and (category != 'Experimental'):
                     documentType = category + ' RFC'

            # ------ Date (only the first matching line is used) -------------
            if not foundDate:
               m = re.match(datePattern, line)
               if m is not None:
                  foundDate = True

                  # ====== Get year =========================================
                  newYear = int(m.group(3))
                  if newYear != year:
                     if verboseMode:
                        sys.stderr.write('Fix: Year ' + str(year) + ' -> ' + str(newYear) + '\n')
                     year = newYear

                  # ====== Get month ========================================
                  newMonth = monthNameTable.index(m.group(1)) + 1
                  if newMonth != month:
                     if verboseMode:
                        sys.stderr.write('Fix: Month ' + str(month) + ' -> ' + str(newMonth) + '\n')
                     month = newMonth

                  # ====== Get day ==========================================
                  m2 = re.match('.* ([0-9]+) ' + monthNameTable[month - 1] + ' ' + str(year) + '$', line)
                  if m2 is not None:   # Format: <Day> <Month> <Year>
                     newDay = int(m2.group(1))
                  else:                # Format: <Month> <Day>, <Year>
                     try:
                        newDay = int(m.group(2).replace(',', ''))
                     except ValueError:
                        newDay = 0
                  if (newDay != 0) and (newDay != day):
                     if verboseMode:
                        sys.stderr.write('Fix: Day ' + str(day) + ' -> ' + str(newDay) + '\n')
                     day = newDay

         # ====== Authors section ============================================
         if len(authorRegExpList) < 1:
            continue

         if inAuthorsSection:
            # ------ Check for end of author section -------------------------
            if re.search(r'^[0-9\.]*  ', line):
               inAuthorsSection = False

            # ------ Parse author section ------------------------------------
            elif not re.search(r'^[\s]*$', line):
               for i, authorRegExp in enumerate(authorRegExpList):
                  m = re.match(r'^(.*)(' + authorRegExp + r')([\t ].*|,.*|)$', line)
                  # Keep the longest spelling of the name found so far.
                  if (m is not None) and (len(m.group(2)) >= len(fullAuthorList[i])):
                     fullAuthorList[i] = str(m.group(2))

         else:
            if re.search(authorsHeadingPattern, line, re.IGNORECASE) is not None:
               # Ignore the heading when it is a line of the table of contents.
               if re.search(r'\. \.[ \t]*[0-9]+$', line) is None:
                  inAuthorsSection = True


   # ====== Fix author list via author fix file ================================
   csv.register_dialect('authors-fix', delimiter=' ', quoting=csv.QUOTE_MINIMAL, skipinitialspace=True)
   if authorsFixFile == 'DEFAULT':
      authorsFixFileList = [ 'authors-fix.list', '/usr/share/doc/bibtexconv/examples/authors-fix.list' ]
   elif authorsFixFile is not None:
      authorsFixFileList = [ authorsFixFile ]
   else:
      authorsFixFileList = [ ]

   # Use the first candidate that exists and can be read completely.
   fixRowList = None
   for authorsFixFileName in authorsFixFileList:
      if not os.path.exists(authorsFixFileName):
         continue
      try:
         with open(authorsFixFileName, 'r', encoding='utf-8', newline='') as inputFile:
            fixRowList = list(csv.reader(inputFile, dialect='authors-fix'))
         break
      except (OSError, UnicodeError, csv.Error):
         pass

   if (authorsFixFile is not None) and (fixRowList is None):
      sys.stderr.write('ERROR: Unable to load authors fix file. Use option --authors-fix-file=FILE or --no-authors-fix-file!\n')
      sys.exit(1)
   elif fixRowList is not None:
      if verboseMode:
         sys.stderr.write('NOTE: Using authors fix file ' + authorsFixFileName + '\n')

      for row in fixRowList:
         if len(row) < 2:
            continue   # Blank or incomplete line: nothing to replace.
         # Each row replaces at most one author: the first one that matches.
         for i, fullAuthor in enumerate(fullAuthorList):
            if row[0] == fullAuthor:
               if verboseMode:
                  sys.stderr.write('Fix: ' + fullAuthor + ' -> ' + row[1] + '\n')
               fullAuthorList[i] = row[1]
               break


   # ====== Check ===========================================================
   if not foundDate:
      sys.stderr.write('WARNING: Unable to find date of document ' + document + '!\n')


   # ====== Print result ====================================================
   sys.stdout.write('@TechReport{ ' + document + ',\n')
   sys.stdout.write('\tauthor = "' + ' and '.join(fullAuthorList) + '",\n')

   sys.stdout.write('\ttitle = "{' + title + '}",\n')
   sys.stdout.write('\ttype = "' + documentType + '",\n')
   sys.stdout.write('\tnumber = "' + name + '",\n')

   if day > 0:
      sys.stdout.write('\tday = "' + str(day) + '",\n')
   if 1 <= month <= 12:
      sys.stdout.write('\tmonth = ' + monthIDTable[month - 1] + ',\n')
   sys.stdout.write('\tyear = "' + str(year) + '",\n')

   sys.stdout.write('\tinstitution = "' + institution + '",\n')
   sys.stdout.write('\tlanguage = "' + language + '",\n')
   if issn is not None:
      sys.stdout.write('\tissn = "' + issn + '",\n')
   if doi is not None:
      sys.stdout.write('\tdoi = "' + doi + '",\n')
   if note is not None:
      sys.stdout.write('\tnote = "' + note + '",\n')
   if abstract is not None:
      sys.stdout.write('\tabstract = "{' + abstract + '}",\n')
   sys.stdout.write('\turl = "' + url + '"\n')

   sys.stdout.write('}\n\n')



# ###### Handle arguments ###################################################
if len(sys.argv) < 2:
   sys.stderr.write('Usage: ietf2bibtex [--quiet|--verbose] [--authors-fix-file=FILE|--no-authors-fix-file] document ...\n')
   sys.exit(1)

# The arguments are processed strictly from left to right: an option only
# affects the documents that follow it on the command line. Every argument
# that is not one of the options below is treated as a document name.
for argument in sys.argv[1:]:
   if argument == '--quiet':
      verboseMode = False
   elif argument == '--verbose':
      verboseMode = True
   elif argument == '--no-authors-fix-file':
      authorsFixFile = None
   elif argument.startswith('--authors-fix-file='):
      # The file name is part of the same argument (after the '=').
      authorsFixFile = argument[len('--authors-fix-file='):]
   else:
      getBibTeXFromIETFDocument(argument)
