Merge pull request #5 from BlipRanger/testing
1.2 Merge
BlipRanger authored May 11, 2021
2 parents 2e021e1 + cfda1c6 commit fc03f8f
Showing 6 changed files with 221 additions and 40 deletions.
30 changes: 30 additions & 0 deletions Dockerfile
@@ -0,0 +1,30 @@
FROM python:3.9

RUN apt-get update
RUN apt-get install ffmpeg -y

COPY ./requirements.txt requirements.txt
COPY ./bdfrToHTML.py bdfrToHTML.py
COPY ./style.css style.css
COPY ./start.py start.py

ENV BDFR_FREQ=15
ENV BDFR_IN=/input
ENV BDFR_OUT=/output
ENV BDFR_RECOVER_COMMENTS=True
ENV BDFR_ARCHIVE_CONTEXT=True
ENV BDFR_LIMIT=1100
ENV RUN_BDFR=False
ENV BDFRH_DELETE=False
ENV BDFRH_LOGLEVEL=INFO

EXPOSE 5000
EXPOSE 7634

RUN pip install -r requirements.txt

RUN mkdir input
RUN mkdir output
RUN mkdir config

CMD python start.py
13 changes: 12 additions & 1 deletion README.md
@@ -8,14 +8,25 @@ Currently only supports the json version of the archive output from BDfR V2.

`python3 bdfrToHTML.py --input ./location/of/archive/and/downloads --output /../html/`

Use `python3 bdfrToHTML.py --help` for a full list of options.

**Docker-Compose**

To run both bdfr and bdfr-html in an automated fashion, I have included a docker-compose file that spins up two containers: an automation container and a web server container. The automation container runs bdfr and then bdfr-html, producing a volume or mounted folder containing the generated HTML files; the web server container shares that output volume and hosts the generated files. Currently this setup only saves "Saved" user content, though this may change in the future. If you would rather populate bdfr-html with your own reddit json/media files from bdfr, you can use a similar docker-compose file, but mount the folder where you saved your content to the `BDFR_IN` folder (`/input` by default) and leave the env variable `RUN_BDFR` set to false (the default).

For the time being, you will need to authenticate bdfr yourself: either place a copy of your config file in the mounted config folder, or browse to the auth link that bdfr prints to the container logs and then run `docker exec -it bdfr-html wget "[insert resulting url here]"` with a copy of the resulting auth URL.

To run the compose file, simply clone this repo and run `docker-compose up`.

**Additional Features**

- Use the `--archive_context` option to pull the related contextual post for downloaded comments (requires BDfR in the same folder).
- Use the `--recover_comments` option to have the script attempt to pull deleted comments from Pushshift.
- The script can be told to watch the input folder for new content using `--watch_folder` and `--watch_freq`.
- The script now avoids reprocessing inputs by storing a list of processed post ids (`idList.txt`) in the output folder; see the sketch below.
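
A minimal sketch of that reprocessing guard is shown below. It assumes the same `idList.txt` layout (one post id per line in the output folder); the helper names (`load_processed_ids`, `save_processed_ids`, `process_post`, `discovered_posts`) are illustrative only and are not the actual functions in `bdfrToHTML.py`.

```python
import os

def load_processed_ids(output_folder):
    """Return the set of post ids already rendered, read from idList.txt if present."""
    id_file = os.path.join(output_folder, 'idList.txt')
    if not os.path.isfile(id_file):
        return set()
    with open(id_file, 'r') as f:
        return {line.strip() for line in f if line.strip()}

def save_processed_ids(output_folder, ids):
    """Persist the processed ids so the next run can skip them."""
    with open(os.path.join(output_folder, 'idList.txt'), 'w') as f:
        f.writelines(post_id + '\n' for post_id in sorted(ids))

# Hypothetical usage inside the conversion loop:
# processed = load_processed_ids(output_folder)
# for post_id, archive_file in discovered_posts:
#     if post_id in processed:
#         continue  # already rendered on a previous run
#     process_post(archive_file)
#     processed.add(post_id)
# save_processed_ids(output_folder, processed)
```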

**Planned Features**

- Using Pushshift to pull deleted post contents
- Adding an optional archiver to archive webpages linked in posts
- Docker support with built-in webserver and BDfR
- Possible static-site generation
151 changes: 112 additions & 39 deletions bdfrToHTML.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

__author__ = "BlipRanger"
__version__ = "0.1.1"
__version__ = "0.1.2"
__license__ = "GNU GPLv3"

import json
@@ -13,9 +13,14 @@
import requests
import logging
import subprocess
import time

#Logging Setup
logging.basicConfig(level=logging.INFO)
level = os.environ['BDFRH_LOGLEVEL']
if level == "DEBUG":
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)


#Globals
@@ -30,16 +35,28 @@ def loadJson(file_path):
data = json.load(f)
f.close()
logging.debug('Loaded ' + file_path)
data['htmlPath'] = writeToHTML(data)
return data

#Search the input folder for media files containing the id value from an archive
#Inputs: id name, folder to search
def findMatchingMedia(name, folder):
paths = []

#Don't copy if we already have it
existingMedia = os.path.join(outputFolder, "media/")
for dirpath, dnames, fnames in os.walk(existingMedia):
for f in fnames:
if (name) in f and not f.endswith('.json'):
logging.debug("Find Matching Media found: " + dirpath + f)
paths.append(os.path.join('media/', f))
if len(paths) > 0:
logging.info("Existing media found for " + name)
return paths

for dirpath, dnames, fnames in os.walk(folder):
for f in fnames:
if (name) in f and not f.endswith('.json'):
logging.debug("Find Matching Media found: " + dirpath + f)
paths.append(copyMedia(os.path.join(dirpath, f), f))
return paths

@@ -54,6 +71,7 @@ def buildGallery(paths):

#Extract/convert markdown from saved selfpost
def parseSelfPost(filePath):
logging.debug("Parsing selfpost for " + filePath)
txt = '<div>'
with open(filePath, 'r') as file:
content = file.read()
@@ -65,6 +83,7 @@ def parseSelfPost(filePath):
#Handle the html formatting for images, videos, and text files
#Input: list of paths to media
def formatMatchingMedia(paths):
logging.debug("Formatting media for " + str(paths))
if paths is None:
return ""
if len(paths) == 1:
@@ -73,16 +92,16 @@
if path.endswith('jpg') or path.endswith('jpeg') or path.endswith('png') or path.endswith('gif'):
return '<a href={path}><img src={path}></a>'.format(path=path)
elif path.endswith('m4a') or path.endswith('mp4') or path.endswith('mkv'):
return '<video max-height="500" controls><source src="{path}"></video>'.format(path=path)
return '<video max-height="500" controls preload="metadata"><source src="{path}"></video>'.format(path=path)
elif path.endswith('txt'):
return parseSelfPost(outputFolder + path)
return parseSelfPost(os.path.join(outputFolder, path))
elif(len(paths) > 1):
return buildGallery(paths)
return ""

#Copy media from the input folder to the file structure of the html pages
def copyMedia(mediaPath, filename):
writeFolder = outputFolder + 'media/'
writeFolder = os.path.join(outputFolder, 'media/')
assure_path_exists(writeFolder)
if filename.endswith('mp4'):
try:
Expand All @@ -91,7 +110,7 @@ def copyMedia(mediaPath, filename):
except:
logging.error('FFMPEG failed')
else:
shutil.copyfile(mediaPath, writeFolder + filename)
shutil.copyfile(mediaPath, os.path.join(writeFolder, filename))
logging.debug('Moved ' + mediaPath + ' to ' + writeFolder +filename)
return 'media/' + filename

@@ -125,20 +144,19 @@ def recoverDeletedComment(comment):

#Requires bdfr V2
def archiveContext(link):
assure_path_exists(inputFolder + "context/")
path = os.path.join(inputFolder, "context/")
assure_path_exists(path)
data={}
try:
logging.debug("python3.9 -m bulkredditdownloader archive -l '{link}' {folder}".format(link=link, folder=inputFolder + "context"))
logging.debug("python3.9 -m bulkredditdownloader download -l '{link}' --file-scheme \'{{POSTID}}\' {folder}".format(link=link, folder=inputFolder + "context"))
subprocess.call(["python3.9", "-m", "bulkredditdownloader", "archive", "-l", link, inputFolder + "context"])
subprocess.call(["python3.9", "-m", "bulkredditdownloader", "download", "-l", link, "--file-scheme", "{{POSTID}}", inputFolder + "context"])
subprocess.call(["python3.9", "-m", "bdfr", "archive", "-l", link, path])
subprocess.call(["python3.9", "-m", "bdfr", "download", "-l", link, "--file-scheme", "{POSTID}", path])
except:
logging.error("Failed to archive context")
for dirpath, dnames, fnames in os.walk(inputFolder + "context"):
for f in fnames:
print(f)
if f.endswith(".json"):
data = loadJson(os.path.join(dirpath, f))
data['htmlPath'] = writeToHTML(data)
shutil.rmtree(inputFolder + "context/")
return data['htmlPath']

@@ -238,7 +256,8 @@ def getSubreddit(permalink):
#Write html file from given post archive info
def writeToHTML(data):
file_path = data['id'] + '.html'
with open(outputFolder + file_path, 'w') as file:
path = os.path.join(outputFolder, file_path)
with open(path, 'w') as file:
html = writeHead()
if data.get('parent_id', None) is None:
html = html + writePost(data) + "<h2>Comments</h2><div class=comments>"
@@ -266,38 +285,33 @@ def assure_path_exists(path):
#Main function, loops through json files in input folder, extracts archive data into dict,
#formats/writes archive data and media to html files, creates a single index.html file with
#links to all archived posts.
@click.command()
@click.option('--input', default='.', help='The folder where the download and archive results have been saved to')
@click.option('--output', default='./html/', help='Folder where the HTML results should be created.')
@click.option('--recover_comments', default=False, help='Should we attempt to recover deleted comments?')
@click.option('--archive_context', default=False, help='Should we attempt to archive the contextual post for saved comments?')
def converter(input, output, recover_comments, archive_context):
global inputFolder
global outputFolder
global recoverComments
global context

#Set globals (there is probably a better way to do this)
inputFolder = input
outputFolder = output
recoverComments = recover_comments
context = archive_context

def main():

#Begin main process
assure_path_exists(output)
datalist = []

file_path = os.path.join(outputFolder, 'idList.txt')
if os.path.isfile(file_path):
with open(file_path, 'r') as f:
datalist = list(f)

assure_path_exists(outputFolder)
if not os.path.isdir(inputFolder):
raise ValueError('Input folder does not exist')
html = writeHead()
postCount = 0
pageCount = 1
for dirpath, dnames, fnames in os.walk(input):
for dirpath, dnames, fnames in os.walk(inputFolder):
for f in fnames:
if f.endswith(".json"):
data = loadJson(os.path.join(dirpath, f))
data['htmlPath'] = writeToHTML(data)
if postCount == 25:
file_path = output + '/page{pageCount}.html'.format(pageCount=pageCount)
file_path = os.path.join(outputFolder, 'page{pageCount}.html'.format(pageCount=pageCount))
with open(file_path, 'w') as file:
html = html + """<div class=footer><div class=previousPage><a href='page{previous}.html'>Previous Page</a></div>
<div class=nextPage><a href='page{next}.html'>Next Page</a></div></div>
</body>
<div class=nextPage><a href='page{next}.html'>Next Page</a></div></div>
</body>
</html>""".format(previous=pageCount-1, next=pageCount+1)
file.write(html)
html = writeHead()
@@ -308,8 +322,9 @@ def converter(input, output, recover_comments, archive_context):
else:
html = html + '<a href={local_path}>{post}</a>'.format(post=writeCommentPost(data), local_path=data['htmlPath'])
postCount = postCount + 1
datalist.append(data['id'] + "\n")

file_path = output + '/page{pageCount}.html'.format(pageCount=pageCount)
file_path = os.path.join(outputFolder, 'page{pageCount}.html'.format(pageCount=pageCount))
with open(file_path, 'w') as file:
html = html + """<div class=footer><div class=previousPage><a href='page{previous}.html'>Previous Page</a></div>
<div class=nextPage><a href='page{next}.html'>Next Page</a></div></div>
@@ -319,13 +334,71 @@
html = writeHead()


file_path = output + '/index.html'
file_path = os.path.join(outputFolder, 'index.html')
with open(file_path, 'w') as file:
html = html + """
<meta http-equiv="refresh" content="0; URL='page1.html'" /></div></body>
</html>"""
file.write(html)
shutil.copyfile('style.css', outputFolder + 'style.css')

file_path = os.path.join(outputFolder, 'idList.txt')
with open(file_path, 'w') as file:
file.writelines(datalist)

shutil.copyfile('style.css', os.path.join(outputFolder, 'style.css'))
logging.info("Run Complete!")


@click.command()
@click.option('--input', default='.', help='The folder where the download and archive results have been saved to')
@click.option('--output', default='./html/', help='Folder where the HTML results should be created.')
@click.option('--recover_comments', default=False, type=bool, help='Should we attempt to recover deleted comments?')
@click.option('--archive_context', default=False, type=bool, help='Should we attempt to archive the contextual post for saved comments?')
@click.option('--watch_folder', default=False, type=bool, help='After the first run, watch the input folder for changes and rerun when detected')
@click.option('--watch_freq', default=1, help='How often should we recheck the watched input folder in minutes. Requires watch_folder be enabled')
@click.option('--delete_input', default=False, type=bool, help='Should we delete the input after creating the output?')
def converter(input, output, recover_comments, archive_context, watch_folder, watch_freq, delete_input):
global inputFolder
global outputFolder
global recoverComments
global context

#Set globals (there is probably a better way to do this)
inputFolder = os.path.join(input, '')
outputFolder = os.path.join(output, '')
recoverComments = (recover_comments)
context = (archive_context)
delete_input = (delete_input)

logging.debug("Recover Comments: " + str(recoverComments))
logging.debug("Recover Context: " + str(context))

#Simple watch function
if watch_folder:
oldContent = []
logging.info("Watching...")
while True:
content = []
for dirpath, dnames, fnames in os.walk(inputFolder):
for f in fnames:
content.append(f)
for d in dnames:
content.append(d)
if content != oldContent:
logging.info("Content found!")
main()
else:
logging.info("Nothing new, sleeping for " + str(watch_freq) + " minutes.")
time.sleep(watch_freq * 60)
oldContent = content
else:
main()

if delete_input:
for root, dirs, files in os.walk(inputFolder):
for file in files:
os.remove(os.path.join(root, file))


if __name__ == '__main__':
converter()
28 changes: 28 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,28 @@
version: '3'
services:
bdfr-html: #Container which runs both bdfr and bdfr-html
container_name: bdfr-html
build: . #Builds from current folder, could also build from git repo
environment: #Defaults are defined in the Dockerfile
- BDFR_LIMIT=15 #Number of saved posts to download
- BDFR_RECOVER_COMMENTS=False #Should bdfr-html try to recover deleted comments from pushshift?
- BDFR_FREQ=15 #How often should we pull saved content from reddit
- BDFR_OUT=/output #What folder internal to the container should the created files go
- BDFR_IN=/input #Where should bdfr place files pulled from reddit
- BDFR_ARCHIVE_CONTEXT=True #Should bdfr-html also get post associated with saved comments
- RUN_BDFR=True #Run bdfr each time before running bdfr-html
- BDFRH_DELETE=False #Delete input media files after copying them to the media folder
- BDFRH_LOGLEVEL=INFO #Either INFO or DEBUG for more verbose logs
volumes:
- output:/output #Shared docker volume between the script container and the webserver, could also be mounted
- ./config:/root/.config/bdfr/ #Mounted folder where the bdfr config file lives
apache: #Standard webserver serving the content located in the BDFR_OUT var
image: httpd:alpine
container_name: bdfr-server
ports:
- "80:80" #Ports can be redefined as needed
volumes:
- output:/usr/local/apache2/htdocs/ #This should match the output volume or mount in the script container

volumes:
output:
11 changes: 11 additions & 0 deletions requirements.txt
@@ -0,0 +1,11 @@
click==7.1.2
Markdown==3.3.4
appdirs>=1.4.4
bs4>=0.0.1
dict2xml>=1.7.0
ffmpeg-python>=0.2.0
praw>=7.2.0
pyyaml>=5.4.1
requests>=2.25.1
youtube-dl>=2021.3.14
bdfr==2.1.0
