Source code for pycite.pycite

import os
import re
from urllib.error import HTTPError, URLError
from urllib.request import urlopen, Request
import tempfile 
import bs4

from . import ncbi, pubmed, sciencedirect
import time

# Access date appended to every citation, formatted as day/abbreviated month/year.
# Computed once when the module is imported; reassign it to use a different date.
date_today = time.strftime("%d/%b/%Y", time.localtime())


def match_source(input_line):
    """Return every known source keyword found in ``input_line``.

    :param input_line: Text (normally a URL) to scan.
    :return: A list of the matched keywords ("pubmed", "ncbi", "jstor" or
        "sciencedirect") in order of appearance; empty when none occur.
    """
    source_pattern = re.compile("pubmed|ncbi|jstor|sciencedirect")
    return [hit.group(0) for hit in source_pattern.finditer(input_line)]


def switch_method(input_line, input_file, output_file, cit_list, bs4_link, **kwargs):
    """Build the citation for one link and append it to ``cit_list``.

    The first source keyword found in ``input_line`` selects the citation
    method. Links with no recognised keyword, or with a keyword that has no
    citation method (currently "jstor"), are skipped with a warning instead of
    raising ``IndexError``/``KeyError``.

    :param input_line: The link being cited, as read from the input file.
    :param input_file: The open input file; only its ``name`` is used, for the
        progress message.
    :param output_file: Unused here; writing happens in ``PyCite.cite``.
    :param cit_list: List that the finished citation is appended to (mutated
        in place).
    :param bs4_link: Parsed page for the link, handed to the citation method.
    :param kwargs: Extra options forwarded to the pubmed citation method only.
    :return: None
    """
    import warnings

    # Dispatch table: one lookup instead of a chain of nested if statements.
    methods = {"pubmed": pubmed.pubmed_final_citation,
               "ncbi": ncbi.ncbi_final_citation,
               "sciencedirect": sciencedirect.sd_final_citation}
    #          "jstor": jstor.jstor_citation}

    matched_sources = match_source(input_line)
    use_method = matched_sources[0] if matched_sources else None

    # Check that a method exists BEFORE looking it up and calling it.
    if use_method not in methods:
        warnings.warn(f"No suitable method found for {input_line}, skipping....")
        return

    print(f"{input_line} in {input_file.name} is a(n) {use_method} link, using {use_method} methods")

    citation_method = methods[use_method]
    # Only the pubmed method is given the extra keyword options.
    if use_method == "pubmed":
        citation = citation_method(bs4_link, **kwargs)
    else:
        citation = citation_method(bs4_link)

    # This function only collects citations; add the access date to each one.
    cit_list.append(citation + " [Accessed " + date_today + "]")


class PyCite(object):
    """Create citations for the paper links listed in a text file."""

    def __init__(self, input_file, output_file, show_doi=False):
        """
        :param input_file A file containing links to papers to cite.
        :param output_file A file/filename to write citations to.
        :param show_doi Boolean to control if DOIs should be included in citations. Defaults to False.
        :return An object of class PyCite
        :raises FileNotFoundError If either path is not an existing file.
        :raises ValueError If a file name has no extension.
        :raises AssertionError If a file's extension is not "txt".
        """
        self.input_file = input_file
        self.output_file = output_file
        self.show_doi = show_doi

        # Validate with explicit raises: ``assert`` statements are removed
        # when Python runs with -O, which would silently disable the checks.
        for _file in [self.input_file, self.output_file]:
            if not os.path.isfile(_file):
                raise FileNotFoundError(f"{_file} does not exist")

            # Use the final extension of the file name only, so dots in
            # directory names or earlier in the name are not mistaken for it.
            _, dot, file_format = os.path.basename(_file).rpartition(".")
            if not dot or not file_format:
                raise ValueError(f"No file format was detected in {_file}, exiting...")
            if file_format != "txt":
                # AssertionError is kept so existing callers see the same type.
                raise AssertionError(f"Only txt files supported for now, not {file_format}")

    def cite(self):
        """Fetch every link in the input file and write the sorted citations.

        Each non-blank line of the input file is treated as one link. The page
        is downloaded, parsed with BeautifulSoup and passed to
        ``switch_method``, which appends the citation to the result list. The
        citations are then sorted and written one per line to the output file.

        :return The sorted list of citations.
        :raises ValueError If a link cannot be reached.
        """
        final_citations = []
        # This is useful for tests (we use tempfile) to avoid permission denied errors, on Window$
        if isinstance(self.output_file, tempfile._TemporaryFileWrapper):
            use_out_file = self.output_file
        else:
            use_out_file = open(self.output_file, "w")

        # The output handle is entered first so that it is closed even when
        # opening the input file fails.
        with use_out_file as out_file, open(self.input_file, "r") as in_file:
            for line in in_file:
                # Assume that links are inputted as lines in the input file
                link = line.strip()
                if not link:
                    # Blank lines (e.g. a trailing newline) are not links.
                    continue

                # Running curl works but not requests, no idea why
                # curl -I "https://www.jstor.org/stable/26469531" --user-agent "Mozilla/5.0"
                use_agent = {'User-Agent': 'Mozilla/5.0'} if "jstor" in link else {'User-Agent': 'XYZ/3.0'}
                # TODO: Jstor citations work locally but not remote, temporarily disabling jstor tests.
                try:
                    paper_link = urlopen(Request(link, headers=use_agent))
                except HTTPError as err:
                    raise ValueError(f"{link} not reachable, code: {str(err.code)}") from err
                except URLError as err:
                    raise ValueError(f"{link} not reachable, reason: {str(err.reason)}") from err

                # Convert to a BS4 object, closing the response once parsed.
                with paper_link:
                    bs4_link = bs4.BeautifulSoup(paper_link, features="html.parser")
                switch_method(link, in_file, out_file, final_citations, bs4_link, show_doi=self.show_doi)

            final_citations.sort()  # Sorting the final citations list

            # Writing the sorted citations to the output file.
            for cit in final_citations:
                out_file.write(f"{cit}\n")
        return final_citations
Source code for pycite.pycite

pycite

Navigation

Related Topics