Heroku H13 desc="Connection closed without response" for file (pdf) upload

35 views Asked by At

I have a use case where the user can upload a pdf book and I process a summary for that book. For a 250-page book, for example, the summary process takes around 2-3 minutes and it works great on local. However, in Heroku, for that 250-page book, I get the error below. When I try the 20-page book, it's okay. I run Heroku using professional dynos. I have 3 dynos. I have also shared my views on how files are processed. There you will see libraries that I use as well.

I have read that Heroku enforces a hard 30-second request timeout, i.e., if 30 seconds pass before the summary is done, the router kills the connection and returns an error. Is that true? How can I avoid it?

Thank you in advance for your help.

error:

error in server: 2023-08-15T09:12:55.850512+00:00 app[web.1]: [2023-08-15 09:12:55 +0000] [98] [INFO] Worker exiting (pid: 98)
2023-08-15T09:12:55.848850+00:00 heroku[router]: at=error code=H13 desc="Connection closed without response" method=POST path="/file_processing/upload/" host=someendpoint request_id=b8045e24-a33f-4446-869e-e22a27fbef32 fwd="109.245.202.232,141.101.96.16" dyno=web.1 connect=0ms service=30471ms status=503 bytes=0 protocol=https

Procfile

web: gunicorn backend.wsgi

views.py

from django.http import JsonResponse
from rest_framework.views import APIView
from rest_framework.parsers import MultiPartParser, FormParser
from .serializers import DocumentSerializer
from docx import Document as DocxDocument
from pdfminer.high_level import extract_text
import pytesseract
from PIL import Image
import traceback
import os
import tempfile
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
import warnings
from django.conf import settings



# Module-import-time sanity check: report whether the NLTK tokenizer data
# configured in settings is present on this dyno/host (uniform 4-space indent;
# the original mixed 3- and 4-space indentation).
if os.path.exists(settings.TOKENIZER_PATH):
    print("Tokenizer exists at:", settings.TOKENIZER_PATH)
else:
    print("Tokenizer not found!")




def generate_summary(text, char_limit=500):
    """Summarize ``text`` to at most ``char_limit`` characters.

    Strategy: try LSA first; if LSA emits a ``UserWarning`` during
    summarization (e.g. a degenerate input for the SVD step), fall back to
    LexRank; if LexRank raises, fall back to Luhn as a last resort.

    Args:
        text: Plain text to summarize.
        char_limit: Maximum length of the returned summary (default 500).

    Returns:
        The summary, truncated to ``char_limit`` characters.
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    def _run(summarizer_cls):
        # All three sumy summarizers share the same call shape: pick the
        # 3 most relevant sentences and join them into one string.
        summary = summarizer_cls()(parser.document, 3)
        return " ".join(str(sentence) for sentence in summary)

    try:
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            summarized_text = _run(LsaSummarizer)
            # A UserWarning during LSA is treated as a soft failure that
            # triggers the fallback chain.
            if any(item.category == UserWarning for item in caught):
                raise UserWarning
    except UserWarning:
        try:
            summarized_text = _run(LexRankSummarizer)
        except Exception:
            summarized_text = _run(LuhnSummarizer)

    # Slicing a string shorter than char_limit returns it unchanged, so no
    # length check is needed.
    return summarized_text[:char_limit]


class FileUploadView(APIView):
    """Accept a document upload (.pdf/.docx/.jpg/.jpeg/.png), extract its text,
    and return it — summarized via :func:`generate_summary` when it exceeds
    5000 characters."""

    parser_classes = (MultiPartParser, FormParser)

    def extract_from_docx(self, file_obj):
        # Concatenate every paragraph's text, one paragraph per line.
        doc = DocxDocument(file_obj)
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

    def extract_from_pdf(self, file_path):
        # pdfminer's high-level API works from a filesystem path, hence the
        # temporary file spooling done in post().
        return extract_text(file_path)

    def extract_from_image(self, file_obj):
        # OCR the image with Tesseract.
        image = Image.open(file_obj)
        return pytesseract.image_to_string(image)

    def post(self, request, *args, **kwargs):
        """Handle the upload, returning {'file_content': ...} or an error JSON."""
        try:
            file_serializer = DocumentSerializer(data=request.data)
            if not file_serializer.is_valid():
                return JsonResponse(file_serializer.errors, status=400)

            document_instance = file_serializer.save()
            uploaded_file = document_instance.uploaded_file

            if uploaded_file.name.endswith('.pdf'):
                # pdfminer needs a real path, so spool the upload to disk.
                # The original wrote this temp file for EVERY upload type but
                # only deleted it in the pdf branch, leaking a file per
                # non-pdf upload and per extraction error; the try/finally
                # guarantees cleanup.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                    for chunk in uploaded_file.chunks():
                        temp_file.write(chunk)
                try:
                    data = self.extract_from_pdf(temp_file.name)
                finally:
                    os.remove(temp_file.name)
            elif uploaded_file.name.endswith('.docx'):
                with uploaded_file.open('rb') as file:
                    data = self.extract_from_docx(file)
            elif uploaded_file.name.endswith(('.jpg', '.jpeg', '.png')):
                with uploaded_file.open('rb') as file:
                    data = self.extract_from_image(file)
            else:
                return JsonResponse({'error': 'Unsupported file type'}, status=400)

            if len(data) > 5000:
                data = generate_summary(data)

            return JsonResponse({'file_content': data}, safe=False)
        except Exception as e:
            # Top-level boundary: log the traceback server-side, return a
            # generic 500 to the client.
            print("Error in FileUploadView:", str(e))
            print(traceback.format_exc())
            return JsonResponse({'error': 'An unexpected error occurred.'}, status=500)
0

There are 0 answers