
nlp

API reference for the CivicLens natural language processing toolkit.

Reference

comments

assign_clusters(df, clusters)

Inserts cluster info into the polars df of data from the initial pull

Parameters:

    df (DataFrame): df from the initial pull. Required.
    clusters (list[set[int]]): clusters from Louvain Communities. Required.

Returns:

    DataFrame: updated df.

Source code in civiclens/nlp/comments.py
def assign_clusters(df: pl.DataFrame, clusters: list[set[int]]) -> pl.DataFrame:
    """Inserts cluster info into the polars df of data from the initial pull

    Args:
        df (pl.DataFrame): df from initial pull
        clusters (list[set[int]]): clusters from Louvain Communities

    Returns:
        pl.DataFrame: updated df
    """
    rows = df.shape[0]
    # go through clusters and add that info to df
    for i, cluster_data in enumerate(clusters):
        df = df.with_columns(
            pl.when(pl.arange(0, rows).is_in(cluster_data))
            .then(i)
            .otherwise(pl.col("cluster"))
            .alias("cluster")
        )

    return df
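
For example, a minimal sketch (assuming civiclens.nlp.comments is importable and the df already carries a nullable cluster column, as produced by get_doc_comments):

import polars as pl

from civiclens.nlp.comments import assign_clusters

# four comments, no cluster assignments yet
df = pl.DataFrame(
    {
        "comment": ["a", "b", "c", "d"],
        "cluster": pl.Series([None] * 4, dtype=pl.Int64),
    }
)
clusters = [{0, 1}, {2, 3}]  # e.g. the output of get_clusters
df = assign_clusters(df, clusters)
print(df["cluster"].to_list())  # expected: [0, 0, 1, 1]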

build_graph(df)

Builds a network graph with comments as nodes and their similarities as weights

Parameters:

    df (DataFrame): df with pairs of comment indices and a cosine similarity. Required.

Returns:

    Graph: network graph with comments as nodes and their similarities as weights.

Source code in civiclens/nlp/comments.py
def build_graph(df: pl.DataFrame) -> nx.Graph:
    """Builds a network graph with comments as nodes and their similarities as
    weights

    Args:
        df (pl.DataFrame): df with pairs of comment indices and a cosine
            similarity

    Returns:
        nx.Graph: network graph with comments as nodes and their similarities as
            weights
    """
    graph_data = df.to_dicts()
    G = nx.Graph()
    for edge in graph_data:
        G.add_edge(edge["idx1"], edge["idx2"], weight=edge["similarity"])

    return G
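
A minimal usage sketch (the import path is taken from the source location above):

import polars as pl

from civiclens.nlp.comments import build_graph

pairs = pl.DataFrame(
    {
        "idx1": [0, 0, 1],
        "idx2": [1, 2, 2],
        "similarity": [0.91, 0.42, 0.88],
    }
)
G = build_graph(pairs)
print(G.number_of_nodes(), G.number_of_edges())  # 3 3
print(G[0][1]["weight"])  # 0.91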

comment_similarity(df, model)

Create df with comment mappings and their semantic similarity scores according to the SBERT paraphrase-mining method, using the all-mpnet-base-v2 model from Hugging Face.

Parameters:

    df (DataFrame): df with comment data. Required.
    model (SentenceTransformer): SBERT sentence transformer model. Required.

Returns:

    tuple[DataFrame, DataFrame]: df_paraphrases and df_form_letter, cosine similarities for non form letters and form letters respectively.

Source code in civiclens/nlp/comments.py
def comment_similarity(
    df: pl.DataFrame, model: SentenceTransformer
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Create df with comment mappings and their semantic similarity scores
    according to the SBERT paraphrase mining method using the all-mpnet-base-v2
    model from hugging face.

    Args:
        df (pl.DataFrame): df with comment data
        model (SentenceTransformer): sbert sentence transformer model

    Returns:
        df_paraphrase, df_form_letter (tuple[pl.DataFrame]): cosine
            similarities for form letters and non form letters
    """
    paraphrases = util.paraphrase_mining(
        model, df["comment"].to_list(), show_progress_bar=True
    )
    df_full = pl.DataFrame(
        {
            "similarity": pl.Series(
                "similarity", [x[0] for x in paraphrases], dtype=pl.Float64
            ),
            "idx1": pl.Series(
                "idx1", [x[1] for x in paraphrases], dtype=pl.Int64
            ),
            "idx2": pl.Series(
                "idx2", [x[2] for x in paraphrases], dtype=pl.Int64
            ),
        }
    )

    df_paraphrases = df_full.filter(pl.col("similarity") <= 0.99)
    df_paraphrases = df_paraphrases.with_columns(
        pl.lit(False).alias("form_letter")
    )

    df_form_letter = df_full.filter(pl.col("similarity") > 0.99)
    df_form_letter = df_form_letter.with_columns(
        pl.lit(True).alias("form_letter")
    )

    return df_paraphrases, df_form_letter
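
A hedged usage sketch (downloads the all-mpnet-base-v2 model on first run; the import path is assumed from the source location above):

import polars as pl
from sentence_transformers import SentenceTransformer

from civiclens.nlp.comments import comment_similarity

model = SentenceTransformer("all-mpnet-base-v2")
df = pl.DataFrame(
    {
        "comment": [
            "Please lower the filing fee.",
            "I ask that you lower the filing fee.",
            "The reporting deadline should be extended.",
        ]
    }
)
df_paraphrases, df_form_letter = comment_similarity(df, model)
print(df_paraphrases.columns)  # ['similarity', 'idx1', 'idx2', 'form_letter']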

compute_similiarity_clusters(embeds, sim_threshold)

Extract form letters from corpus of comments.

Parameters:

    embeds (ndarray): array of embeddings representing the documents. Required.
    sim_threshold (float): distance threshold used to divide clusters. Required.

Returns:

    ndarray: array of cluster labels, one per document.

Source code in civiclens/nlp/comments.py
def compute_similiarity_clusters(
    embeds: np.ndarray, sim_threshold: float
) -> np.ndarray:
    """
    Extract form letters from corpus of comments.

    Args:
        embeds: array of embeddings representing the documents
        sim_threshold: distance thresholds to divide clusters

    Returns:
        Array of docs by cluster
    """
    kmeans = AgglomerativeClustering(
        n_clusters=None,
        metric="cosine",
        linkage="average",
        distance_threshold=sim_threshold,
    )
    kmeans.fit(embeds)

    return kmeans.labels_
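
As an illustration, two well-separated groups of embeddings should come back with two distinct labels (a sketch; the import path is assumed):

import numpy as np

from civiclens.nlp.comments import compute_similiarity_clusters

rng = np.random.default_rng(0)
group_a = rng.normal(loc=1.0, scale=0.01, size=(5, 32))
group_b = rng.normal(loc=-1.0, scale=0.01, size=(5, 32))
embeds = np.vstack([group_a, group_b])

labels = compute_similiarity_clusters(embeds, sim_threshold=0.025)
print(labels)  # e.g. [0 0 0 0 0 1 1 1 1 1]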

count_unique_comments(df)

Counts the number of unique comments identified by performing paraphrase mining on a corpus of comments.

Parameters:

    df (DataFrame): dataframe of similar comments. Required.

Source code in civiclens/nlp/comments.py
def count_unique_comments(df: pl.DataFrame) -> int:
    """
    Counts number of unique comments identified by performing paraphrase
    mining on a corpus of comments.

    Args:
        df: dataframe of similar comments
    """
    indices = df["idx1"].to_list() + df["idx2"].to_list()

    return len(set(indices))
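
For instance, three distinct comment indices across the similarity pairs give a count of 3 (a sketch; import path assumed):

import polars as pl

from civiclens.nlp.comments import count_unique_comments

pairs = pl.DataFrame(
    {"idx1": [0, 0, 1], "idx2": [1, 2, 2], "similarity": [0.9, 0.4, 0.8]}
)
print(count_unique_comments(pairs))  # 3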

find_central_node(G, clusters)

Find the most representative comment in a cluster by identifying the most central node

Parameters:

    G (Graph): network graph with comments as nodes and their similarities as weights. Required.
    clusters (list[set[int]]): clusters from Louvain Communities. Required.

Returns:

    dict: dictionary with the central comment id as the key and the degree centrality as the value.

Source code in civiclens/nlp/comments.py
def find_central_node(G: nx.Graph, clusters: list[set[int]]) -> dict:
    """Find the most representative comment in a cluster by identifying the
    most central node

    Args:
        G (nx.Graph): network graph with comments as nodes and their
            similarities as weights
        clusters (list[set[int]]): clusters from Louvain Communities

    Returns:
        dict: dictionary with the central comment id as the key and the degree
            centrality as the value
    """
    centrality_per_cluster = {}
    for cluster in clusters:
        # focus on each specific cluster of comments
        subgraph = G.subgraph(cluster)
        # calculate the centrality of each comment in the cluster
        centralities = nx.degree_centrality(subgraph)
        # Find the node with the highest centrality
        central_node = max(centralities, key=centralities.get)
        centrality_per_cluster[central_node] = centralities[central_node]

    return centrality_per_cluster
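
A small worked example on a toy graph (import path assumed; ties in degree centrality are broken by node iteration order):

import networkx as nx

from civiclens.nlp.comments import find_central_node

G = nx.Graph()
G.add_weighted_edges_from([(0, 1, 0.9), (1, 2, 0.8), (3, 4, 0.7)])
clusters = [{0, 1, 2}, {3, 4}]

# node 1 touches both other nodes in its cluster, so it is the most central
print(find_central_node(G, clusters))  # e.g. {1: 1.0, 3: 1.0}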

find_form_letters(df, model, form_threshold)

Finds and extracts form letters by clustering, and counts the number of unique comments.

Parameters:

    df (DataFrame): dataframe of comments to extract form letters from. Required.
    model (SentenceTransformer): vectorizer model for text embeddings. Required.
    form_threshold (int): threshold to consider a comment a form letter. Required.

Returns:

    tuple[list[dict], int]: list of form letters, number of unique comments.

Source code in civiclens/nlp/comments.py
def find_form_letters(
    df: pl.DataFrame, model: SentenceTransformer, form_threshold: int
) -> tuple[list[dict], int]:
    """
    Finds and extracts form letters by clustering, counts number of unique
    comments.

    Args:
        df: dataframe of comments to extract form letters from
        model: vectorizer model for text embeddings
        form_threshold: threshold to consider a comment a form letter

    Returns:
        List of form letters, number of unique comments
    """
    # TODO clean strings
    num_form_letters = 0
    form_letters = []
    docs = df["comment_text"].to_numpy()

    if len(docs) <= 1:  # cannot cluster with less than 2 documents
        return form_letters, num_form_letters

    embeds = model.encode(docs, convert_to_numpy=True)
    clusters = compute_similiarity_clusters(embeds, sim_threshold=0.025)
    document_id = df.unique(subset="document_id").select("document_id").item()

    num_form_letters += clusters.max() + 1
    for cluster in range(num_form_letters):
        cluster_docs = docs[np.where(clusters == cluster)]
        if cluster_docs.size == 0:
            continue

        num_rep = (
            df.filter(pl.col("comment_text").is_in(cluster_docs))
            .select("comments_represented")
            .sum()
            .item()
        )
        letter_text = np.random.choice(cluster_docs, size=1).item()
        letter_id = (
            df.filter(pl.col("comment_text") == letter_text)
            .select("comment_id")
            .item()
        )

        form_letter = True
        if num_rep <= form_threshold:
            form_letter = False

        form_letters.append(
            {
                "comments_represented": num_rep,
                "comment_id": letter_id,
                "document_id": document_id,
                "comment_text": letter_text,
                "form_letter": form_letter,
            }
        )

    return form_letters, num_form_letters

get_clusters(G)

Defines clusters based on the Louvain Communities algorithm

Parameters:

    G (Graph): network graph with comments as nodes and their similarities as weights. Required.

Returns:

    list[set[int]]: list of sets, where each set is a cluster of comment nodes.

Source code in civiclens/nlp/comments.py
def get_clusters(G: nx.Graph) -> list[set[int]]:
    """Defines clusters based on the Louvain Communities algorithm

    Args:
        G (nx.Graph): network graph with comments as nodes and their
            similarities as weights

    Returns:
        list[set[int]]: sets are clusters of comment nodes
    """
    return louvain_communities(G=G)
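
Because Louvain community detection is randomized, the exact grouping can vary between runs; a sketch (import path assumed):

import networkx as nx

from civiclens.nlp.comments import get_clusters

G = nx.Graph()
G.add_weighted_edges_from([(0, 1, 0.9), (1, 2, 0.85), (3, 4, 0.8)])
print(get_clusters(G))  # e.g. [{0, 1, 2}, {3, 4}]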

get_doc_comments(id)

Pulls all comments for a given document and preprocesses them into a polars dataframe

Parameters:

    id (str): document id. Required.

Returns:

    DataFrame: formatted polars df.

Source code in civiclens/nlp/comments.py
def get_doc_comments(id: str) -> pl.DataFrame:
    """Pulls all comments for a set of documents and preprocesses that into a
    polars dataframe

    Args:
        id (int): document id

    Returns:
        pl.DataFrame: formated polars df
    """
    query = f"""
        SELECT id, document_id, comment
        FROM regulations_comment
        WHERE document_id = '{id}';
        """  # noqa: E702, E231, E241
    # filter out attached files
    db = Database()
    df = pull_data(
        query=query, connection=db, schema=["id", "document_id", "comment"]
    )
    pattern = (
        r"(?i)^see attached file(s)?\.?$"
        r"|(?i)^please see attached?\.?$"
        r"|(?i)^see attached?\.?"
        r"|(?i)^see attached file\(s\)\.?$"
    )

    filtered_df = df.filter(~pl.col("comment").str.contains(pattern))

    # TODO create clusters column in comment table and delete these lines
    rows = filtered_df.shape[0]
    filtered_df = filtered_df.with_columns(
        pl.Series("cluster", [None] * rows).cast(pl.Utf8)
    )
    return filtered_df

rep_comment_analysis(comment_data, df, model)

Runs all representative comment code for a document

Parameters:

    comment_data (RepComments): empty RepComments object. Required.
    df (DataFrame): dataframe of comments pertaining to a document. Required.
    model (SentenceTransformer): SBERT model for embeddings. Required.

Returns:

    RepComments: dataclass with comment data.

Source code in civiclens/nlp/comments.py
def rep_comment_analysis(
    comment_data: RepComments, df: pl.DataFrame, model: SentenceTransformer
) -> RepComments:
    """Runs all representative comment code for a document

    Args:
        comment_data (RepComments): empty RepComments object
        df (pl.DataFrame): dataframe of comments pertaining to a document
        model (SentenceTransformer): SBERT model for embeddings

    Returns:
        RepComments: dataclass with comment data
    """
    df_paraphrases, df_form_letter = comment_similarity(df, model)

    try:
        G_paraphrase = build_graph(df_paraphrases)
        clusters_paraphrase = get_clusters(G=G_paraphrase)
        df = assign_clusters(df=df, clusters=clusters_paraphrase)
        df_rep_paraphrase = representative_comments(
            G_paraphrase, clusters_paraphrase, df, form_letter=False
        ).sort(pl.col("comments_represented"), descending=True)
    except ZeroDivisionError:
        print("Paraphrase Clustering Not Possible: Empty DataFrame")

    try:
        G_form_letter = build_graph(df_form_letter)
        clusters_form_letter = get_clusters(G=G_form_letter)
        df = assign_clusters(df=df, clusters=clusters_form_letter)
        df_rep_form = representative_comments(
            G_form_letter,
            clusters_form_letter,
            df,
            form_letter=True,
        ).sort(pl.col("comments_represented"), descending=True)
    except ZeroDivisionError:
        print("Form Letter Clustering Not Possible: Empty DataFrame")

    # fill out comment class
    comment_data.doc_comments = df
    form_letters, num_form_letters = find_form_letters(
        df_rep_form, model, form_threshold=10
    )

    if df_rep_form.is_empty():
        comment_data.rep_comments = df_rep_paraphrase.to_dicts()
        comment_data.num_representative_comment = df_rep_paraphrase.shape[0]
    elif df_rep_paraphrase.is_empty():
        comment_data.rep_comments = form_letters
        comment_data.num_representative_comment = num_form_letters
    else:
        comment_data.rep_comments = form_letters + df_rep_paraphrase.to_dicts()
        comment_data.num_representative_comment = len(comment_data.rep_comments)

    num_paraphrased = count_unique_comments(df_paraphrases)
    comment_data.num_total_comments = df.shape[0]
    comment_data.num_unique_comments = (
        num_paraphrased + num_form_letters
        if num_paraphrased < comment_data.num_total_comments
        else num_paraphrased
    )

    return comment_data

representative_comments(G, clusters, df, form_letter)

Creates a dataframe with the text of the representative comments along with the number of comments that are semantically represented by that text

Parameters:

    G (Graph): network graph with comments as nodes and their similarities as weights. Required.
    clusters (list[set[int]]): clusters from Louvain Communities. Required.
    df (DataFrame): df from initial pull with added cluster info. Required.
    form_letter (bool): whether the clusters represent form letters. Required.

Returns:

    output_df (DataFrame): df with representation information.

Source code in civiclens/nlp/comments.py
def representative_comments(
    G: nx.Graph, clusters: list[set[int]], df: pl.DataFrame, form_letter: bool
) -> pl.DataFrame:
    """Creates a dataframe with the text of the representative comments along
    with the number of comments that are semantically represented by that text

    Args:
        G (nx.Graph): network graph with comments as nodes and their
            similarities as weights
        clusters (list[set[int]]): clusters from Louvain Communities
        df (pl.DataFrame): df from initial pull with added cluster info

    Returns:
        output_df (pl.DataFrame): df with representation information
    """
    central_nodes = find_central_node(G, clusters)
    representative_dict = {
        "comments_represented": [],
        "comment_id": [],
        "document_id": [],
        "comment_text": [],
        "form_letter": [],
    }
    for i, community in enumerate(clusters):
        community_size = len(community)
        central_node = list(central_nodes.keys())[i]
        representative_dict.get("comments_represented").append(community_size)
        representative_dict.get("comment_id").append(df[central_node, 0])
        representative_dict.get("document_id").append(df[central_node, 1])
        representative_dict.get("comment_text").append(df[central_node, 2])
        representative_dict.get("form_letter").append(form_letter)

    output_df = pl.DataFrame(representative_dict)

    if form_letter:
        return output_df.unique(subset=["comment_text"])
    else:
        return output_df

titles

TitleChain

Creates more accessible titles for regulation documents

Source code in civiclens/nlp/titles.py
class TitleChain:
    """Creates more accessible titles for regulation documnents"""

    def __init__(self) -> None:
        self.template = """You are a title generator that is given a paragraph
        summary of a regulation. Your job is to create a title that conveys the
        content of the paragraph summary in a succinct way that highlights the
        content that would be relevant to someone who is civically engaged and
        looking to find interesting regulations to comment on.

            Regulation Summary: {paragraph}

            Answer:"""
        self.prompt = PromptTemplate.from_template(self.template)
        self.model = title_model
        self.tokenizer = title_tokenizer
        self.pipe = pipeline(
            "text2text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_length=20,
        )
        self.hf_pipeline = HuggingFacePipeline(pipeline=self.pipe)
        self.parse = StrOutputParser()
        self.chain = self.prompt | self.hf_pipeline | self.parse

    def invoke(self, paragraph: str) -> str:
        return self.chain.invoke({"paragraph": paragraph})
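
A hedged usage sketch; constructing the class loads the underlying title model and tokenizer, which may take time and disk space on first use:

from civiclens.nlp.titles import TitleChain

chain = TitleChain()
summary = (
    "This proposed rule would require annual emissions reporting from "
    "facilities that exceed the stated thresholds."
)
print(chain.invoke(summary))  # a short, plain-English title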

get_doc_summary(id)

Gets the id and summary for a given document

Parameters:

    id (str): document id. Required.

Returns:

    DataFrame: formatted polars df.

Source code in civiclens/nlp/titles.py
def get_doc_summary(id: str) -> pl.DataFrame:
    """Gets the id and summary for a given document

    Args:
        id (str): document id

    Returns:
        pl.DataFrame: formatted polars df
    """
    db = Database()
    query = f"""
            SELECT id, summary
            FROM regulations_document
            WHERE id = '{id}'
            """
    schema = ["id", "summary"]
    return pull_data(query=query, connection=db, schema=schema)

tools

Comment

Source code in civiclens/nlp/tools.py
@dataclass
class Comment:
    text: str = ""
    num_represented: int = 1
    id: str = str(uuid4())
    topic_label: str = ""
    topic: list[str] = None
    form_letter: bool = False
    sentiment: str = ""
    source: str = "Comment"
    representative: bool = False

    def to_dict(self):
        """
        Converts comment object to dictionary.
        """
        return {
            "text": self.text,
            "num_represented": self.num_represented,
            "id": self.id,
            "topic_label": self.topic_label,
            "topic": self.topic,
            "form_letter": self.form_letter,
            "sentiment": self.sentiment,
            "source": self.source,
        }
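
A small sketch of the dataclass in use (import path assumed):

from civiclens.nlp.tools import Comment

c = Comment(text="Please extend the comment period.", sentiment="neutral")
print(c.to_dict()["sentiment"])  # 'neutral'
print(c.representative)  # False by default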

to_dict()

Converts comment object to dictionary.

Source code in civiclens/nlp/tools.py
def to_dict(self):
    """
    Converts comment object to dictionary.
    """
    return {
        "text": self.text,
        "num_represented": self.num_represented,
        "id": self.id,
        "topic_label": self.topic_label,
        "topic": self.topic,
        "form_letter": self.form_letter,
        "sentiment": self.sentiment,
        "source": self.source,
    }

RepComments

Source code in civiclens/nlp/tools.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class RepComments:
    # clustered df for topics
    document_id: str
    doc_comments: pl.DataFrame = Field(default=pl.DataFrame())

    # fields for nlp table
    rep_comments: list = Field(default=[])
    doc_plain_english_title: str = ""
    num_total_comments: int = 0
    num_unique_comments: int = 0
    num_representative_comment: int = 0
    topics: list = Field(default=[])
    last_updated: datetime = datetime.now()
    uuid: int = uuid4().int
    search_vector: list = Field(default=[])
    summary: str = ""
    representative: bool = True

    # test this!
    def get_nonrepresentative_comments(self):
        """
        Converts nonrepresentative comments to list of Comment objects.
        """
        rep_ids = set()
        for comment in self.rep_comments:
            if isinstance(comment, Comment):
                rep_ids.add(comment.id)
            else:
                rep_ids.add(comment["comment_id"])

        return [
            Comment(id=comment["id"], text=comment["comment"])
            for comment in self.doc_comments.to_dicts()
            if comment["id"] not in rep_ids
        ]

    def to_list(self):
        """
        Converts representative comments to list of Comment objects.
        """
        if not self.rep_comments:
            return []

        return [
            Comment(
                text=comment["comment_text"],
                num_represented=comment["comments_represented"],
                id=comment["comment_id"],
                form_letter=comment["form_letter"],
                representative=True,
            )
            for comment in self.rep_comments
        ]

get_nonrepresentative_comments()

Converts nonrepresentative comments to list of Comment objects.

Source code in civiclens/nlp/tools.py
def get_nonrepresentative_comments(self):
    """
    Converts nonrepresentative comments to list of Comment objects.
    """
    rep_ids = set()
    for comment in self.rep_comments:
        if isinstance(comment, Comment):
            rep_ids.add(comment.id)
        else:
            rep_ids.add(comment["comment_id"])

    return [
        Comment(id=comment["id"], text=comment["comment"])
        for comment in self.doc_comments.to_dicts()
        if comment["id"] not in rep_ids
    ]

to_list()

Converts representative comments to list of Comment objects.

Source code in civiclens/nlp/tools.py
def to_list(self):
    """
    Converts representative comments to list of Comment objects.
    """
    if not self.rep_comments:
        return []

    return [
        Comment(
            text=comment["comment_text"],
            num_represented=comment["comments_represented"],
            id=comment["comment_id"],
            form_letter=comment["form_letter"],
            representative=True,
        )
        for comment in self.rep_comments
    ]

sentiment_analysis(comment, pipeline)

Analyze sentiment of a comment.

Parameters:

    comment (Comment): Comment object. Required.
    pipeline (pipeline): Hugging Face pipeline for conducting sentiment analysis. Required.

Returns:

    str: sentiment label as a string (e.g. 'positive', 'negative', 'neutral').

Source code in civiclens/nlp/tools.py
def sentiment_analysis(comment: Comment, pipeline: pipeline) -> str:
    """
    Analyze sentiment of a comment.

    Args:
        comment: Comment object
        pipeline: Hugging Face pipeline for conducting sentiment analysis

    Returns:
        Sentiment label as string (e.g. 'positive', 'negative', 'neutral')
    """
    try:
        out = pipeline(comment.text)[0]
    except Exception as e:
        print(e)
        return ""

    return out["label"]
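
A hedged sketch using a default Hugging Face sentiment pipeline (the exact label strings depend on the model the pipeline loads):

from transformers import pipeline

from civiclens.nlp.tools import Comment, sentiment_analysis

analyzer = pipeline("sentiment-analysis")  # downloads a default model on first use
comment = Comment(text="I strongly support this proposed rule.")
print(sentiment_analysis(comment, analyzer))  # e.g. 'POSITIVE'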

topics

HDAModel

Performs HDP topic modeling

Source code in civiclens/nlp/topics.py
class HDAModel:
    """
    Performs HDP topic modeling
    """

    def __init__(self):
        self.model = None
        self.tokenizer = partial(regex_tokenize, pattern=r"\W+")
        self.stop_words = stopwords(
            Path(__file__).resolve().parent / "saved_models/stop_words.pickle"
        )
        self.terms = None

    def _process_text(
        self, comments: list[Comment]
    ) -> tuple[list[list[str]], dict[int, str]]:
        """
        Clean text and convert to tokens
        """
        docs = []
        document_ids = {}
        for idx, comment in enumerate(comments):
            docs.append(self.tokenizer(clean_text(comment.text).lower()))
            document_ids[idx] = comment.id

        # remove numbers, 2 character tokens, and stop words
        docs = [
            [
                token
                for token in doc
                if not token.isnumeric()
                and len(token) > 2
                and token not in self.stop_words
            ]
            for doc in docs
        ]

        return docs, document_ids

    def _create_corpus(
        self, docs: list[list[str]]
    ) -> tuple[Dictionary, list[tuple]]:
        """
        Converts tokens to corpus and corresponding dictionary
        """
        bigram_generator = Phrases(docs, min_count=10).freeze()

        for doc in docs:
            for token in bigram_generator[doc]:
                if "_" in token:
                    doc.append(token)

        token_dict = corpora.Dictionary(docs)
        corpus = [token_dict.doc2bow(doc) for doc in docs]

        return token_dict, corpus

    def run_model(self, comments: list[Comment]):
        """
        Runs HDP topic analysis.
        """
        docs, document_id = self._process_text(comments)
        token_dict, corpus = self._create_corpus(docs)

        hdp_model = HdpModel(corpus, token_dict)
        numeric_topics = self._find_best_topic(hdp_model, corpus)

        comment_topics = {}
        topic_terms = {}
        for doc_id, topic in numeric_topics.items():
            comment_id = document_id[doc_id]
            if topic not in topic_terms:
                topic_terms[topic] = [
                    word for word, _ in hdp_model.show_topic(topic)
                ]
            comment_topics[comment_id] = topic

        self.terms = topic_terms

        return comment_topics

    def _find_best_topic(
        self, model: HdpModel, corpus: list[tuple]
    ) -> dict[int, int]:
        """
        Computes most probable topic per document
        """
        best_topic = {}
        for doc_id, doc in enumerate(corpus):
            max_prob = float("-inf")
            topic_id = -1
            for topic_num, prob in model[doc]:
                if prob > max_prob:
                    max_prob = prob
                    topic_id = topic_num
            best_topic[doc_id] = topic_id

        return best_topic

    def get_terms(self) -> dict:
        """
        Returns terms for all topics
        """
        if not self.terms:
            return {}

        return self.terms

    def generate_search_vector(self) -> list[str]:
        """
        Creates an array of topics to use in the Django search model.
        """
        if not self.terms:
            raise RuntimeError(
                "Must run topic model before generating search vector"
            )

        search_vector = set()
        for term_list in self.terms.values():
            search_vector.update(term_list)

        return list(search_vector)

generate_search_vector()

Creates an array of topics to use in the Django search model.

Source code in civiclens/nlp/topics.py
def generate_search_vector(self) -> list[str]:
    """
    Creates an array of topics to use in the Django search model.
    """
    if not self.terms:
        raise RuntimeError(
            "Must run topic model before generating search vector"
        )

    search_vector = set()
    for term_list in self.terms.values():
        search_vector.update(term_list)

    return list(search_vector)

get_terms()

Returns terms for all topics

Source code in civiclens/nlp/topics.py
def get_terms(self) -> dict:
    """
    Returns terms for all topics
    """
    if not self.terms:
        return {}

    return self.terms

run_model(comments)

Runs HDP topic analysis.

Source code in civiclens/nlp/topics.py
def run_model(self, comments: list[Comment]):
    """
    Runs HDP topic analysis.
    """
    docs, document_id = self._process_text(comments)
    token_dict, corpus = self._create_corpus(docs)

    hdp_model = HdpModel(corpus, token_dict)
    numeric_topics = self._find_best_topic(hdp_model, corpus)

    comment_topics = {}
    topic_terms = {}
    for doc_id, topic in numeric_topics.items():
        comment_id = document_id[doc_id]
        if topic not in topic_terms:
            topic_terms[topic] = [
                word for word, _ in hdp_model.show_topic(topic)
            ]
        comment_topics[comment_id] = topic

    self.terms = topic_terms

    return comment_topics

LabelChain

Source code in civiclens/nlp/topics.py
class LabelChain:
    def __init__(self):
        self.tokenizer = label_tokenizer
        self.model = label_model

    def generate_label(self, terms: list[str]) -> tuple:
        """
        Create better topic terms.
        """
        text = ", ".join(terms)

        inputs = self.tokenizer(
            [text], max_length=512, truncation=True, return_tensors="pt"
        )
        output = self.model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=64
        )

        decoded_output = self.tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0]

        return tuple(set(decoded_output.strip().split(", ")))

generate_label(terms)

Create better topic terms.

Source code in civiclens/nlp/topics.py
def generate_label(self, terms: list[str]) -> tuple:
    """
    Create better topic terms.
    """
    text = ", ".join(terms)

    inputs = self.tokenizer(
        [text], max_length=512, truncation=True, return_tensors="pt"
    )
    output = self.model.generate(
        **inputs, num_beams=8, do_sample=True, min_length=10, max_length=64
    )

    decoded_output = self.tokenizer.batch_decode(
        output, skip_special_tokens=True
    )[0]

    return tuple(set(decoded_output.strip().split(", ")))

create_topics(comments)

Condense topics for document summary

Parameters:

    comments (list[Comment]): list of Comment objects. Required.

Returns:

    list[dict]: topic dictionaries with corresponding sentiment data, sorted by total comments represented.

Source code in civiclens/nlp/topics.py
def create_topics(comments: list[Comment]) -> dict:
    """
    Condense topics for document summary

    Args:
        comments: list of Comment objects

    Returns:
        Dictionary of topics, and corresponding sentiment data
    """
    temp = defaultdict(dict)

    for comment in comments:
        temp[comment.topic_label][comment.sentiment] = (
            temp[comment.topic_label].get(comment.sentiment, 0)
            + comment.num_represented
        )
        temp[comment.topic_label]["total"] = (
            temp[comment.topic_label].get("total", 0) + comment.num_represented
        )

    topics = []
    # create output dictionary
    for topic_label, part in temp.items():
        part["topic"] = topic_label
        topics.append(part)

    # sort topics by "total"
    return sorted(topics, key=lambda topic: topic["total"], reverse=True)
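
A small worked example (a sketch; in the full pipeline topic_label and sentiment are filled in by the topic and sentiment models):

from civiclens.nlp.tools import Comment
from civiclens.nlp.topics import create_topics

comments = [
    Comment(text="Fees are too high.", topic_label="fees", sentiment="negative", num_represented=3),
    Comment(text="The fee increase is fine.", topic_label="fees", sentiment="positive", num_represented=1),
    Comment(text="Extend the deadline.", topic_label="deadlines", sentiment="neutral", num_represented=2),
]
print(create_topics(comments))
# [{'negative': 3, 'total': 4, 'positive': 1, 'topic': 'fees'},
#  {'neutral': 2, 'total': 2, 'topic': 'deadlines'}]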

label_topics(topics, model)

Generates a label for all topics

Parameters:

    topics (dict[int, list]): dictionary of topics, as lists of terms. Required.
    model (LabelChain): LLM model to generate labels. Required.

Returns:

    dict[int, str]: dictionary mapping topics to labels.

Source code in civiclens/nlp/topics.py
def label_topics(topics: dict[int, list], model: LabelChain) -> dict[int, str]:
    """
    Generates a label for all topics

    Args:
        topics: dictionary of topics, as lists of terms
        model: LLM model to generate labels

    Returns:
        Dictionary of topics, and labels
    """
    labels = {}
    for topic, terms in topics.items():
        labels[topic] = model.generate_label(terms)

    return labels

stopwords(model_path)

Loads in a pickled set of stopwords for text processing.

Parameters:

    model_path (Path): path to the downloaded model file. Required.

Returns:

    set[str]: set of stop words.

Source code in civiclens/nlp/topics.py
def stopwords(model_path: Path) -> set[str]:
    """
    Loads in a pickled set of stopwords for text processing.

    Args:
        model_path: path from downloaded model

    Returns:
        Set of stop words.
    """
    with open(model_path, "rb") as f:
        stop_words = pickle.load(f)

    return stop_words

topic_comment_analysis(comment_data, model=None, labeler=None, sentiment_analyzer=None)

Run topic and sentiment analysis.

Parameters:

    comment_data (RepComments): RepComments object. Required.
    model (HDAModel): instance of the topic model class. Default: None.
    labeler (LabelChain): chain for generating topic labels. Default: None.
    sentiment_analyzer (Callable): function to analyze comment text sentiment. Default: None.

Returns:

    RepComments: RepComments object with full topic analysis complete.

Source code in civiclens/nlp/topics.py
def topic_comment_analysis(
    comment_data: RepComments,
    model: HDAModel = None,
    labeler: LabelChain = None,
    sentiment_analyzer: Callable = None,
) -> RepComments:
    """
    Run topic and sentiment analysis.

    Args:
        comment_data: RepComments object
        model: instance of the topic model class
        labeler: chain for generating topic labels
        sentiment_analyzer: function to analyze comment text sentiment

    Returns:
        RepComments object with full topic analysis complete
    """
    comments: list[Comment] = []

    if comment_data.summary:
        comments += [
            Comment(text=comment_data.summary, id="Summary", source="Summary")
        ]

    comments += comment_data.to_list()
    if not comment_data.rep_comments:
        comment_data.representative = False
        comments += comment_data.get_nonrepresentative_comments()

    comment_topics = model.run_model(comments)
    topic_terms = model.get_terms()
    topic_labels = label_topics(topic_terms, labeler)

    # filter out non_rep comments
    rep_comments: list[Comment] = []

    for comment in comments:
        comment.topic_label = topic_labels[comment_topics[comment.id]]
        comment.topic = comment_topics[comment.id]
        comment.sentiment = sentiment_analyzer(comment)
        if comment.representative or not comment_data.representative:
            rep_comments.append(comment)

    rep_comments = sorted(
        rep_comments, key=lambda comment: comment.num_represented, reverse=True
    )

    return RepComments(
        document_id=comment_data.document_id,
        doc_comments=comment_data.doc_comments,
        rep_comments=[comment.to_dict() for comment in rep_comments],
        doc_plain_english_title=comment_data.doc_plain_english_title,
        num_total_comments=comment_data.num_total_comments,
        num_unique_comments=comment_data.num_unique_comments,
        num_representative_comment=comment_data.num_representative_comment,
        topics=create_topics(comments),
        search_vector=model.generate_search_vector(),
        representative=comment_data.representative,
    )

text

clean_text(text, patterns=None)

String cleaning function for comments.

Parameters:

    text (str): comment text. Required.
    patterns (list[tuple]): optional list of (pattern, replacement) regular expression pairs (e.g. [(r'\w+', "-")]). Default: None.

Returns:

    str: cleaned version of text.

Source code in civiclens/utils/text.py
def clean_text(text: str, patterns: Optional[list[tuple]] = None) -> str:
    r"""
    String cleaning function for comments.

    Args:
        text (str): comment text
        patterns (list[tuple]): optional list of (pattern, replacement)
            regular expression pairs (e.g. [(r'\w+', "-")])

    Returns:
        Cleaned version of text
    """
    if patterns is None:
        patterns = []

    text = re.sub(r"&#39;", "'", text)  # this replaces html entity with '
    text = re.sub(r"&rdquo;", '"', text)  # this replaces html entity with "
    text = re.sub(r"&amp;", "&", text)  # this replaces html entity with &
    text = re.sub(r"รข", "", text)
    text = re.sub(r"<br\s*/?>", "", text)

    text = re.sub(r"<\s*br\s*/>", " ", text)
    text = re.sub(r"[^a-zA-Z0-9.'\"\?\: -]", "", text)
    text = re.sub(r"\w*ndash\w*", "", text)

    if patterns:
        for pattern, replacement in patterns:
            text = re.sub(pattern, replacement, text)

    # remove extra whitespace
    return re.sub(r"\s+", " ", text).strip()
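
For example (a sketch; note that characters outside the allowed set, such as commas, are stripped):

from civiclens.utils.text import clean_text

raw = "We don&#39;t support this rule.<br/> See &rdquo;Section 2&rdquo; for details."
print(clean_text(raw))
# We don't support this rule. See "Section 2" for details.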

regex_tokenize(text, pattern='\\W+')

Splits strings into tokens based on a regular expression.

Parameters:

    text (str): string to tokenize. Required.
    pattern (str): regular expression to split tokens on. Default: r'\W+' (non-word characters).

Returns:

    list[str]: list of strings representing the tokens.

Source code in civiclens/utils/text.py
def regex_tokenize(text: str, pattern: str = r"\W+"):
    """
    Splits strings into tokens based on a regular expression.

    Args:
        text: string to tokenize
        pattern: regular expression to split tokens on, defaults to non-word
            characters (r"\W+")

    Returns:
        List of strings representing the tokens
    """
    return re.split(pattern, text)
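
For example (the trailing empty string comes from the final period matching the split pattern):

from civiclens.utils.text import regex_tokenize

print(regex_tokenize("Lower the fee, please."))
# ['Lower', 'the', 'fee', 'please', '']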

sentence_splitter(text, sep='.')

Splits string into sentences.

Parameters:

    text (str): string to process. Required.
    sep (str): value to separate the string on. Default: '.'.

Returns:

    list[str]: list of strings split on the separator value.

Source code in civiclens/utils/text.py
def sentence_splitter(text: str, sep: str = ".") -> list[str]:
    """
    Splits string into sentences.

    Args:
        text: string to process
        sep: value to separate the string on, defaults to '.'

    Returns:
        List of strings split on the separator value
    """
    # remove periods not at the end of sentences
    clean = re.sub(r"\.(?!\s)", " ", text)
    sentences = clean.split(sep)

    return [sentence.strip() + "." for sentence in sentences if sentence]
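
For example:

from civiclens.utils.text import sentence_splitter

text = "The rule takes effect Jan 1. Comments are due Dec 15."
print(sentence_splitter(text))
# ['The rule takes effect Jan 1.', 'Comments are due Dec 15.']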

truncate(text, num_words)

Truncates comments.

Parameters:

    text (str): text of the comment. Required.
    num_words (int): number of words to keep. Required.

Returns:

    str: truncated comment.

Source code in civiclens/utils/text.py
def truncate(text: str, num_words: int) -> str:
    """
    Truncates comments.

    Args:
        text (str): Text of the comment
        num_words (int): Number of words to keep

    Returns:
        Truncated comment
    """
    words = text.split(" ")

    return " ".join(words[:num_words])

database_access

Database

Wrapper for the CivicLens Postgres DB.

Source code in civiclens/utils/database_access.py
class Database:
    """
    Wrapper for the CivicLens Postgres DB.
    """

    def __init__(self):
        self.conn = psycopg2.connect(
            database=os.getenv("DATABASE"),
            user=os.getenv("DATABASE_USER"),
            password=os.getenv("DATABASE_PASSWORD"),
            host=os.getenv("DATABASE_HOST"),
            port=os.getenv("DATABASE_PORT"),
        )

    def cursor(self):
        return self.conn.cursor()

    def close(self):
        return self.conn.close()

    def commit(self):
        return self.conn.commit()

pull_data(connection, query, schema=None, return_type='df')

Takes a SQL query and returns a polars dataframe

Parameters:

    connection (Database): database connection to query against. Required.
    query (str): SQL query. Required.
    schema (list[str]): list of column names for the dataframe. Default: None.
    return_type (str): "df" or "list". Default: 'df'.

Returns:

    DataFrame | List[Tuple]: polars df of comment data or list of comment data.

Source code in civiclens/utils/database_access.py
def pull_data(
    connection: Database,
    query: str,
    schema: Optional[List[str]] = None,
    return_type: str = "df",
) -> pl.DataFrame | List[Tuple]:
    """Takes a SQL Query and returns a polars dataframe

    Args:
        connection (Database): database connection to query against
        query (str): SQL Query
        schema (list[str]): list of column names for the dataframe
        return_type (str): "df" or "list"

    Returns:
        Polars df of comment data or list of comment data
    """ """"""
    if return_type == "df" and not schema:
        raise ValueError("Must input schema to return df")

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    except (Exception, psycopg2.Error) as error:
        raise RuntimeError(
            f"Error while connecting to PostgreSQL: {str(error).strip()}"
        ) from error

    finally:
        # Close the connection and cursor to free resources
        if connection:
            cursor.close()
            connection.close()
            print("PostgreSQL connection is closed")

    if return_type == "df":
        results = pl.DataFrame(results, schema=schema)

    return results
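
A hedged usage sketch; this requires a reachable Postgres instance and the DATABASE* environment variables that Database reads (the table name below comes from get_doc_comments above):

from civiclens.utils.database_access import Database, pull_data

db = Database()  # connects using DATABASE, DATABASE_USER, etc. from the environment
rows = pull_data(
    connection=db,
    query="SELECT id, document_id, comment FROM regulations_comment LIMIT 5;",
    schema=["id", "document_id", "comment"],
)
print(rows.shape)  # (up to 5, 3)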

upload_comments(connection, comments)

Uploads comment data to database.

Parameters:

    connection (Database): Postgres client. Required.
    comments (RepComments): comments to be uploaded. Required.

Returns:

    None: uploads the comments to the database.

Source code in civiclens/utils/database_access.py
def upload_comments(connection: Database, comments: RepComments) -> None:
    """
    Uploads comment data to database.

    Args:
        connection: Postgres client
        comments: comments to be uploaded

    Returns:
        None, uploads comments to database
    """
    query = """
    INSERT INTO regulations_nlpoutput (
            comments,
            is_representative,
            doc_plain_english_title,
            num_total_comments,
            num_unique_comments,
            num_representative_comment,
            topics,
            num_topics,
            last_updated,
            created_at,
            search_topics,
            document_id)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (document_id)
        DO UPDATE SET
            comments = EXCLUDED.comments,
            is_representative = EXCLUDED.is_representative,
            doc_plain_english_title = EXCLUDED.doc_plain_english_title,
            num_total_comments = EXCLUDED.num_total_comments,
            num_unique_comments = EXCLUDED.num_unique_comments,
            num_representative_comment = EXCLUDED.num_representative_comment,
            topics = EXCLUDED.topics,
            num_topics = EXCLUDED.num_topics,
            last_updated = NOW(),
            search_topics = EXCLUDED.search_topics
        WHERE regulations_nlpoutput.last_updated IS NULL
            OR regulations_nlpoutput.last_updated < EXCLUDED.last_updated;
            """

    values = (
        json.dumps(comments.rep_comments),
        comments.representative,
        comments.doc_plain_english_title,
        comments.num_total_comments,
        comments.num_unique_comments,
        comments.num_representative_comment,
        json.dumps(comments.topics),
        len(comments.topics),
        datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
        comments.last_updated.strftime("%m/%d/%Y, %H:%M:%S"),
        ", ".join(comments.search_vector),
        comments.document_id,
    )

    try:
        cursor = connection.cursor()
        cursor.execute(query, values)
        connection.commit()

    except Exception as e:
        print(e)

    if connection:
        cursor.close()
        connection.close()
        print("PostgreSQL connection is closed")