
nlp

API reference for the CivicLens natural language processing toolkit.

Reference

comments

assign_clusters(df, clusters)

Inserts cluster info into the polars df of data from the initial pull

Parameters:

    df (DataFrame): df from the initial pull. Required.
    clusters (list[set[int]]): clusters from Louvain Communities. Required.

Returns:

    DataFrame: updated df.

Source code in civiclens/nlp/comments.py
def assign_clusters(df: pl.DataFrame, clusters: list[set[int]]) -> pl.DataFrame:
    """Inserts cluster info into the polars df of data from the initial pull

    Args:
        df (pl.DataFrame): df from initial pull
        clusters (list[set[int]]): clusters from Louvain Communities

    Returns:
        pl.DataFrame: updated df
    """
    rows = df.shape[0]
    # go through clusters and add that info to df
    for i, cluster_data in enumerate(clusters):
        df = df.with_columns(
            pl.when(pl.arange(0, rows).is_in(cluster_data))
            .then(i)
            .otherwise(pl.col("cluster"))
            .alias("cluster")
        )

    return df
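
For example, a minimal sketch (assuming civiclens.nlp.comments is importable and the df already carries a nullable cluster column, as produced by get_doc_comments):

import polars as pl

from civiclens.nlp.comments import assign_clusters

# four comments, no cluster assignments yet
df = pl.DataFrame(
    {
        "comment": ["a", "b", "c", "d"],
        "cluster": pl.Series([None] * 4, dtype=pl.Int64),
    }
)
clusters = [{0, 1}, {2, 3}]  # e.g. the output of get_clusters
df = assign_clusters(df, clusters)
print(df["cluster"].to_list())  # expected: [0, 0, 1, 1]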

build_graph(df)

Builds a network graph with comments as nodes and their similarities as weights

Parameters:

    df (DataFrame): df with pairs of comment indices and a cosine similarity. Required.

Returns:

    Graph: network graph with comments as nodes and their similarities as weights.

Source code in civiclens/nlp/comments.py
def build_graph(df: pl.DataFrame) -> nx.Graph:
    """Builds a network graph with comments as nodes and their similarities as
    weights

    Args:
        df (pl.DataFrame): df with pairs of comment indices and a cosine
            similarity

    Returns:
        nx.Graph: network graph with comments as nodes and their similarities as
            weights
    """
    graph_data = df.to_dicts()
    G = nx.Graph()
    for edge in graph_data:
        G.add_edge(edge["idx1"], edge["idx2"], weight=edge["similarity"])

    return G
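
A minimal usage sketch (the import path is taken from the source location above):

import polars as pl

from civiclens.nlp.comments import build_graph

pairs = pl.DataFrame(
    {
        "idx1": [0, 0, 1],
        "idx2": [1, 2, 2],
        "similarity": [0.91, 0.42, 0.88],
    }
)
G = build_graph(pairs)
print(G.number_of_nodes(), G.number_of_edges())  # 3 3
print(G[0][1]["weight"])  # 0.91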

comment_similarity(df, model)

Create df with comment mappings and their semantic similarity scores according to the SBERT paraphrase-mining method, using the all-mpnet-base-v2 model from Hugging Face.

Parameters:

    df (DataFrame): df with comment data. Required.
    model (SentenceTransformer): SBERT sentence transformer model. Required.

Returns:

    tuple[DataFrame, DataFrame]: df_paraphrases and df_form_letter, cosine similarities for non form letters and form letters respectively.

Source code in civiclens/nlp/comments.py
def comment_similarity(
    df: pl.DataFrame, model: SentenceTransformer
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Create df with comment mappings and their semantic similarity scores
    according to the SBERT paraphrase mining method using the all-mpnet-base-v2
    model from hugging face.

    Args:
        df (pl.DataFrame): df with comment data
        model (SentenceTransformer): sbert sentence transformer model

    Returns:
        df_paraphrase, df_form_letter (tuple[pl.DataFrame]): cosine
            similarities for form letters and non form letters
    """
    paraphrases = util.paraphrase_mining(
        model, df["comment"].to_list(), show_progress_bar=True
    )
    df_full = pl.DataFrame(
        {
            "similarity": pl.Series(
                "similarity", [x[0] for x in paraphrases], dtype=pl.Float64
            ),
            "idx1": pl.Series(
                "idx1", [x[1] for x in paraphrases], dtype=pl.Int64
            ),
            "idx2": pl.Series(
                "idx2", [x[2] for x in paraphrases], dtype=pl.Int64
            ),
        }
    )

    df_paraphrases = df_full.filter(pl.col("similarity") <= 0.99)
    df_paraphrases = df_paraphrases.with_columns(
        pl.lit(False).alias("form_letter")
    )

    df_form_letter = df_full.filter(pl.col("similarity") > 0.99)
    df_form_letter = df_form_letter.with_columns(
        pl.lit(True).alias("form_letter")
    )

    return df_paraphrases, df_form_letter
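
A hedged usage sketch (downloads the all-mpnet-base-v2 model on first run; the import path is assumed from the source location above):

import polars as pl
from sentence_transformers import SentenceTransformer

from civiclens.nlp.comments import comment_similarity

model = SentenceTransformer("all-mpnet-base-v2")
df = pl.DataFrame(
    {
        "comment": [
            "Please lower the filing fee.",
            "I ask that you lower the filing fee.",
            "The reporting deadline should be extended.",
        ]
    }
)
df_paraphrases, df_form_letter = comment_similarity(df, model)
print(df_paraphrases.columns)  # ['similarity', 'idx1', 'idx2', 'form_letter']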

compute_similiarity_clusters(embeds, sim_threshold)

Extract form letters from corpus of comments.

Parameters:

    embeds (ndarray): array of embeddings representing the documents. Required.
    sim_threshold (float): distance threshold used to divide clusters. Required.

Returns:

    ndarray: array of cluster labels, one per document.

Source code in civiclens/nlp/comments.py
def compute_similiarity_clusters(
    embeds: np.ndarray, sim_threshold: float
) -> np.ndarray:
    """
    Extract form letters from corpus of comments.

    Args:
        embeds: array of embeddings representing the documents
        sim_threshold: distance thresholds to divide clusters

    Returns:
        Array of docs by cluster
    """
    kmeans = AgglomerativeClustering(
        n_clusters=None,
        metric="cosine",
        linkage="average",
        distance_threshold=sim_threshold,
    )
    kmeans.fit(embeds)

    return kmeans.labels_
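
As an illustration, two well-separated groups of embeddings should come back with two distinct labels (a sketch; the import path is assumed):

import numpy as np

from civiclens.nlp.comments import compute_similiarity_clusters

rng = np.random.default_rng(0)
group_a = rng.normal(loc=1.0, scale=0.01, size=(5, 32))
group_b = rng.normal(loc=-1.0, scale=0.01, size=(5, 32))
embeds = np.vstack([group_a, group_b])

labels = compute_similiarity_clusters(embeds, sim_threshold=0.025)
print(labels)  # e.g. [0 0 0 0 0 1 1 1 1 1]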

count_unique_comments(df)

Counts the number of unique comments identified by performing paraphrase mining on a corpus of comments.

Parameters:

    df (DataFrame): dataframe of similar comments. Required.

Source code in civiclens/nlp/comments.py
def count_unique_comments(df: pl.DataFrame) -> int:
    """
    Counts number of unique comments identified by performing paraphrase
    mining on a corpus of comments.

    Args:
        df: dataframe of similar comments
    """
    indices = df["idx1"].to_list() + df["idx2"].to_list()

    return len(set(indices))
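
For instance, three distinct comment indices across the similarity pairs give a count of 3 (a sketch; import path assumed):

import polars as pl

from civiclens.nlp.comments import count_unique_comments

pairs = pl.DataFrame(
    {"idx1": [0, 0, 1], "idx2": [1, 2, 2], "similarity": [0.9, 0.4, 0.8]}
)
print(count_unique_comments(pairs))  # 3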

find_central_node(G, clusters)

Find the most representative comment in a cluster by identifying the most central node

Parameters:

    G (Graph): network graph with comments as nodes and their similarities as weights. Required.
    clusters (list[set[int]]): clusters from Louvain Communities. Required.

Returns:

    dict: dictionary with the central comment id as the key and the degree centrality as the value.

Source code in civiclens/nlp/comments.py
def find_central_node(G: nx.Graph, clusters: list[set[int]]) -> dict:
    """Find the most representative comment in a cluster by identifying the
    most central node

    Args:
        G (nx.Graph): network graph with comments as nodes and their
            similarities as weights
        clusters (list[set[int]]): clusters from Louvain Communities

    Returns:
        dict: dictionary with the central comment id as the key and the degree
            centrality as the value
    """
    centrality_per_cluster = {}
    for cluster in clusters:
        # focus on each specific cluster of comments
        subgraph = G.subgraph(cluster)
        # calculate the centrality of each comment in the cluster
        centralities = nx.degree_centrality(subgraph)
        # Find the node with the highest centrality
        central_node = max(centralities, key=centralities.get)
        centrality_per_cluster[central_node] = centralities[central_node]

    return centrality_per_cluster
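
A small worked example on a toy graph (import path assumed; ties in degree centrality are broken by node iteration order):

import networkx as nx

from civiclens.nlp.comments import find_central_node

G = nx.Graph()
G.add_weighted_edges_from([(0, 1, 0.9), (1, 2, 0.8), (3, 4, 0.7)])
clusters = [{0, 1, 2}, {3, 4}]

# node 1 touches both other nodes in its cluster, so it is the most central
print(find_central_node(G, clusters))  # e.g. {1: 1.0, 3: 1.0}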

find_form_letters(df, model, form_threshold)

Finds and extracts form letters by clustering, and counts the number of unique comments.

Parameters:

    df (DataFrame): dataframe of comments to extract form letters from. Required.
    model (SentenceTransformer): vectorizer model for text embeddings. Required.
    form_threshold (int): threshold to consider a comment a form letter. Required.

Returns:

    tuple[list[dict], int]: list of form letters, number of unique comments.

Source code in civiclens/nlp/comments.py
def find_form_letters(
    df: pl.DataFrame, model: SentenceTransformer, form_threshold: int
) -> tuple[list[dict], int]:
    """
    Finds and extracts form letters by clustering, counts number of unique
    comments.

    Args:
        df: dataframe of comments to extract form letters from
        model: vectorizer model for text embeddings
        form_threshold: threshold to consider a comment a form letter

    Returns:
        List of form letters, number of unique comments
    """
    # TODO clean strings
    num_form_letters = 0
    form_letters = []
    docs = df["comment_text"].to_numpy()

    if len(docs) <= 1:  # cannot cluster with less than 2 documents
        return form_letters, num_form_letters

    embeds = model.encode(docs, convert_to_numpy=True)
    clusters = compute_similiarity_clusters(embeds, sim_threshold=0.025)
    document_id = df.unique(subset="document_id").select("document_id").item()

    num_form_letters += clusters.max() + 1
    for cluster in range(num_form_letters):
        cluster_docs = docs[np.where(clusters == cluster)]
        if cluster_docs.size == 0:
            continue

        num_rep = (
            df.filter(pl.col("comment_text").is_in(cluster_docs))
            .select("comments_represented")
            .sum()
            .item()
        )
        letter_text = np.random.choice(cluster_docs, size=1).item()
        letter_id = (
            df.filter(pl.col("comment_text") == letter_text)
            .select("comment_id")
            .item()
        )

        form_letter = True
        if num_rep <= form_threshold:
            form_letter = False

        form_letters.append(
            {
                "comments_represented": num_rep,
                "comment_id": letter_id,
                "document_id": document_id,
                "comment_text": letter_text,
                "form_letter": form_letter,
            }
        )

    return form_letters, num_form_letters

get_clusters(G)

Defines clusters based on the Louvain Communities algorithm

Parameters:

    G (Graph): network graph with comments as nodes and their similarities as weights. Required.

Returns:

    list[set[int]]: list of sets, where each set is a cluster of comment nodes.

Source code in civiclens/nlp/comments.py
def get_clusters(G: nx.Graph) -> list[set[int]]:
    """Defines clusters based on the Louvain Communities algorithm

    Args:
        G (nx.Graph): network graph with comments as nodes and their
            similarities as weights

    Returns:
        list[set[int]]: sets are clusters of comment nodes
    """
    return louvain_communities(G=G)
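
Because Louvain community detection is randomized, the exact grouping can vary between runs; a sketch (import path assumed):

import networkx as nx

from civiclens.nlp.comments import get_clusters

G = nx.Graph()
G.add_weighted_edges_from([(0, 1, 0.9), (1, 2, 0.85), (3, 4, 0.8)])
print(get_clusters(G))  # e.g. [{0, 1, 2}, {3, 4}]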

get_doc_comments(id)

Pulls all comments for a given document and preprocesses them into a polars dataframe

Parameters:

    id (str): document id. Required.

Returns:

    DataFrame: formatted polars df.

Source code in civiclens/nlp/comments.py
def get_doc_comments(id: str) -> pl.DataFrame:
    """Pulls all comments for a set of documents and preprocesses that into a
    polars dataframe

    Args:
        id (int): document id

    Returns:
        pl.DataFrame: formated polars df
    """
    query = f"""
        SELECT id, document_id, comment
        FROM regulations_comment
        WHERE document_id = '{id}';
        """  # noqa: E702, E231, E241
    # filter out attached files
    db = Database()
    df = pull_data(
        query=query, connection=db, schema=["id", "document_id", "comment"]
    )
    pattern = (
        r"(?i)^see attached file(s)?\.?$"
        r"|(?i)^please see attached?\.?$"
        r"|(?i)^see attached?\.?"
        r"|(?i)^see attached file\(s\)\.?$"
    )

    filtered_df = df.filter(~pl.col("comment").str.contains(pattern))

    # TODO create clusters column in comment table and delete these lines
    rows = filtered_df.shape[0]
    filtered_df = filtered_df.with_columns(
        pl.Series("cluster", [None] * rows).cast(pl.Utf8)
    )
    return filtered_df

rep_comment_analysis(comment_data, df, model)

Runs all representative comment code for a document

Parameters:

    comment_data (RepComments): empty RepComments object. Required.
    df (DataFrame): dataframe of comments pertaining to a document. Required.
    model (SentenceTransformer): SBERT model for embeddings. Required.

Returns:

    RepComments: dataclass with comment data.

Source code in civiclens/nlp/comments.py
def rep_comment_analysis(
    comment_data: RepComments, df: pl.DataFrame, model: SentenceTransformer
) -> RepComments:
    """Runs all representative comment code for a document

    Args:
        comment_data (RepComments): empty RepComments object
        df (pl.DataFrame): dataframe of comments pertaining to a document
        model (SentenceTransformer): SBERT model for embeddings

    Returns:
        RepComments: dataclass with comment data
    """
    df_paraphrases, df_form_letter = comment_similarity(df, model)

    try:
        G_paraphrase = build_graph(df_paraphrases)
        clusters_paraphrase = get_clusters(G=G_paraphrase)
        df = assign_clusters(df=df, clusters=clusters_paraphrase)
        df_rep_paraphrase = representative_comments(
            G_paraphrase, clusters_paraphrase, df, form_letter=False
        ).sort(pl.col("comments_represented"), descending=True)
    except ZeroDivisionError:
        print("Paraphrase Clustering Not Possible: Empty DataFrame")

    try:
        G_form_letter = build_graph(df_form_letter)
        clusters_form_letter = get_clusters(G=G_form_letter)
        df = assign_clusters(df=df, clusters=clusters_form_letter)
        df_rep_form = representative_comments(
            G_form_letter,
            clusters_form_letter,
            df,
            form_letter=True,
        ).sort(pl.col("comments_represented"), descending=True)
    except ZeroDivisionError:
        print("Form Letter Clustering Not Possible: Empty DataFrame")

    # fill out comment class
    comment_data.doc_comments = df
    form_letters, num_form_letters = find_form_letters(
        df_rep_form, model, form_threshold=10
    )

    if df_rep_form.is_empty():
        comment_data.rep_comments = df_rep_paraphrase.to_dicts()
        comment_data.num_representative_comment = df_rep_paraphrase.shape[0]
    elif df_rep_paraphrase.is_empty():
        comment_data.rep_comments = form_letters
        comment_data.num_representative_comment = num_form_letters
    else:
        comment_data.rep_comments = form_letters + df_rep_paraphrase.to_dicts()
        comment_data.num_representative_comment = len(comment_data.rep_comments)

    num_paraphrased = count_unique_comments(df_paraphrases)
    comment_data.num_total_comments = df.shape[0]
    comment_data.num_unique_comments = (
        num_paraphrased + num_form_letters
        if num_paraphrased < comment_data.num_total_comments
        else num_paraphrased
    )

    return comment_data

representative_comments(G, clusters, df, form_letter)

Creates a dataframe with the text of the representative comments along with the number of comments that are semantically represented by that text

Parameters:

    G (Graph): network graph with comments as nodes and their similarities as weights. Required.
    clusters (list[set[int]]): clusters from Louvain Communities. Required.
    df (DataFrame): df from initial pull with added cluster info. Required.
    form_letter (bool): whether the clusters represent form letters. Required.

Returns:

    output_df (DataFrame): df with representation information.

Source code in civiclens/nlp/comments.py
def representative_comments(
    G: nx.Graph, clusters: list[set[int]], df: pl.DataFrame, form_letter: bool
) -> pl.DataFrame:
    """Creates a dataframe with the text of the representative comments along
    with the number of comments that are semantically represented by that text

    Args:
        G (nx.Graph): network graph with comments as nodes and their
            similarities as weights
        clusters (list[set[int]]): clusters from Louvain Communities
        df (pl.DataFrame): df from initial pull with added cluster info

    Returns:
        output_df (pl.DataFrame): df with representation information
    """
    central_nodes = find_central_node(G, clusters)
    representative_dict = {
        "comments_represented": [],
        "comment_id": [],
        "document_id": [],
        "comment_text": [],
        "form_letter": [],
    }
    for i, community in enumerate(clusters):
        community_size = len(community)
        central_node = list(central_nodes.keys())[i]
        representative_dict.get("comments_represented").append(community_size)
        representative_dict.get("comment_id").append(df[central_node, 0])
        representative_dict.get("document_id").append(df[central_node, 1])
        representative_dict.get("comment_text").append(df[central_node, 2])
        representative_dict.get("form_letter").append(form_letter)

    output_df = pl.DataFrame(representative_dict)

    if form_letter:
        return output_df.unique(subset=["comment_text"])
    else:
        return output_df

titles

TitleChain

Creates more accessible titles for regulation documents

Source code in civiclens/nlp/titles.py
class TitleChain:
    """Creates more accessible titles for regulation documnents"""

    def __init__(self) -> None:
        self.template = """You are a title generator that is given a paragraph
        summary of a regulation. Your job is to create a title that conveys the
        content of the paragraph summary in a succinct way that highlights the
        content that would be relevant to someone who is civically engaged and
        looking to find interesting regulations to comment on.

            Regulation Summary: {paragraph}

            Answer:"""
        self.prompt = PromptTemplate.from_template(self.template)
        self.model = title_model
        self.tokenizer = title_tokenizer
        self.pipe = pipeline(
            "text2text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_length=20,
        )
        self.hf_pipeline = HuggingFacePipeline(pipeline=self.pipe)
        self.parse = StrOutputParser()
        self.chain = self.prompt | self.hf_pipeline | self.parse

    def invoke(self, paragraph: str) -> str:
        return self.chain.invoke({"paragraph": paragraph})
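
A hedged usage sketch; constructing the class loads the underlying title model and tokenizer, which may take time and disk space on first use:

from civiclens.nlp.titles import TitleChain

chain = TitleChain()
summary = (
    "This proposed rule would require annual emissions reporting from "
    "facilities that exceed the stated thresholds."
)
print(chain.invoke(summary))  # a short, plain-English title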

get_doc_summary(id)

Gets the id and summary for a given document

Parameters:

    id (str): document id. Required.

Returns:

    DataFrame: formatted polars df.

Source code in civiclens/nlp/titles.py
def get_doc_summary(id: str) -> pl.DataFrame:
    """Gets the id and summary for a given document

    Args:
        id (str): document id

    Returns:
        pl.DataFrame: formatted polars df
    """
    db = Database()
    query = f"""
            SELECT id, summary
            FROM regulations_document
            WHERE id = '{id}'
            """
    schema = ["id", "summary"]
    return pull_data(query=query, connection=db, schema=schema)

tools

Comment

Source code in civiclens/nlp/tools.py
@dataclass
class Comment:
    text: str = ""
    num_represented: int = 1
    id: str = str(uuid4())
    topic_label: str = ""
    topic: list[str] = None
    form_letter: bool = False
    sentiment: str = ""
    source: str = "Comment"
    representative: bool = False

    def to_dict(self):
        """
        Converts comment object to dictionary.
        """
        return {
            "text": self.text,
            "num_represented": self.num_represented,
            "id": self.id,
            "topic_label": self.topic_label,
            "topic": self.topic,
            "form_letter": self.form_letter,
            "sentiment": self.sentiment,
            "source": self.source,
        }
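
A small sketch of the dataclass in use (import path assumed):

from civiclens.nlp.tools import Comment

c = Comment(text="Please extend the comment period.", sentiment="neutral")
print(c.to_dict()["sentiment"])  # 'neutral'
print(c.representative)  # False by default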

to_dict()

Converts comment object to dictionary.

Source code in civiclens/nlp/tools.py
def to_dict(self):
    """
    Converts comment object to dictionary.
    """
    return {
        "text": self.text,
        "num_represented": self.num_represented,
        "id": self.id,
        "topic_label": self.topic_label,
        "topic": self.topic,
        "form_letter": self.form_letter,
        "sentiment": self.sentiment,
        "source": self.source,
    }

RepComments

Source code in civiclens/nlp/tools.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class RepComments:
    # clustered df for topics
    document_id: str
    doc_comments: pl.DataFrame = Field(default=pl.DataFrame())

    # fields for nlp table
    rep_comments: list = Field(default=[])
    doc_plain_english_title: str = ""
    num_total_comments: int = 0
    num_unique_comments: int = 0
    num_representative_comment: int = 0
    topics: list = Field(default=[])
    last_updated: datetime = datetime.now()
    uuid: int = uuid4().int
    search_vector: list = Field(default=[])
    summary: str = ""
    representative: bool = True

    # test this!
    def get_nonrepresentative_comments(self):
        """
        Converts nonrepresentative comments to list of Comment objects.
        """
        rep_ids = set()
        for comment in self.rep_comments:
            if isinstance(comment, Comment):
                rep_ids.add(comment.id)
            else:
                rep_ids.add(comment["comment_id"])

        return [
            Comment(id=comment["id"], text=comment["comment"])
            for comment in self.doc_comments.to_dicts()
            if comment["id"] not in rep_ids
        ]

    def to_list(self):
        """
        Converts representative comments to list of Comment objects.
        """
        if not self.rep_comments:
            return []

        return [
            Comment(
                text=comment["comment_text"],
                num_represented=comment["comments_represented"],
                id=comment["comment_id"],
                form_letter=comment["form_letter"],
                representative=True,
            )
            for comment in self.rep_comments
        ]

get_nonrepresentative_comments()

Converts nonrepresentative comments to list of Comment objects.

Source code in civiclens/nlp/tools.py
def get_nonrepresentative_comments(self):
    """
    Converts nonrepresentative comments to list of Comment objects.
    """
    rep_ids = set()
    for comment in self.rep_comments:
        if isinstance(comment, Comment):
            rep_ids.add(comment.id)
        else:
            rep_ids.add(comment["comment_id"])

    return [
        Comment(id=comment["id"], text=comment["comment"])
        for comment in self.doc_comments.to_dicts()
        if comment["id"] not in rep_ids
    ]

to_list()

Converts representative comments to list of Comment objects.

Source code in civiclens/nlp/tools.py
def to_list(self):
    """
    Converts representative comments to list of Comment objects.
    """
    if not self.rep_comments:
        return []

    return [
        Comment(
            text=comment["comment_text"],
            num_represented=comment["comments_represented"],
            id=comment["comment_id"],
            form_letter=comment["form_letter"],
            representative=True,
        )
        for comment in self.rep_comments
    ]

sentiment_analysis(comment, pipeline)

Analyze sentiment of a comment.

Parameters:

    comment (Comment): Comment object. Required.
    pipeline (pipeline): Hugging Face pipeline for conducting sentiment analysis. Required.

Returns:

    str: sentiment label as a string (e.g. 'positive', 'negative', 'neutral').

Source code in civiclens/nlp/tools.py
def sentiment_analysis(comment: Comment, pipeline: pipeline) -> str:
    """
    Analyze sentiment of a comment.

    Args:
        comment: Comment object
        pipeline: Hugging Face pipeline for conducting sentiment analysis

    Returns:
        Sentiment label as string (e.g. 'positive', 'negative', 'neutral')
    """
    try:
        out = pipeline(comment.text)[0]
    except Exception as e:
        print(e)
        return ""

    return out["label"]
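
A hedged sketch using a default Hugging Face sentiment pipeline (the exact label strings depend on the model the pipeline loads):

from transformers import pipeline

from civiclens.nlp.tools import Comment, sentiment_analysis

analyzer = pipeline("sentiment-analysis")  # downloads a default model on first use
comment = Comment(text="I strongly support this proposed rule.")
print(sentiment_analysis(comment, analyzer))  # e.g. 'POSITIVE'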

topics

HDAModel

Performs HDP topic modeling

Source code in civiclens/nlp/topics.py
class HDAModel:
    """
    Performs HDP topic modeling
    """

    def __init__(self):
        self.model = None
        self.tokenizer = partial(regex_tokenize, pattern=r"\W+")
        self.stop_words = stopwords(
            Path(__file__).resolve().parent / "saved_models/stop_words.pickle"
        )
        self.terms = None

    def _process_text(
        self, comments: list[Comment]
    ) -> tuple[list[list[str]], dict[int, str]]:
        """
        Clean text and convert to tokens
        """
        docs = []
        document_ids = {}
        for idx, comment in enumerate(comments):
            docs.append(self.tokenizer(clean_text(comment.text).lower()))
            document_ids[idx] = comment.id

        # remove numbers, 2 character tokens, and stop words
        docs = [
            [
                token
                for token in doc
                if not token.isnumeric()
                and len(token) > 2
                and token not in self.stop_words
            ]
            for doc in docs
        ]

        return docs, document_ids

    def _create_corpus(
        self, docs: list[list[str]]
    ) -> tuple[Dictionary, list[tuple]]:
        """
        Converts tokens to corpus and corresponding dictionary
        """
        bigram_generator = Phrases(docs, min_count=10).freeze()

        for doc in docs:
            for token in bigram_generator[doc]:
                if "_" in token:
                    doc.append(token)

        token_dict = corpora.Dictionary(docs)
        corpus = [token_dict.doc2bow(doc) for doc in docs]

        return token_dict, corpus

    def run_model(self, comments: list[Comment]):
        """
        Runs HDP topic analysis.
        """
        docs, document_id = self._process_text(comments)
        token_dict, corpus = self._create_corpus(docs)

        hdp_model = HdpModel(corpus, token_dict)
        numeric_topics = self._find_best_topic(hdp_model, corpus)

        comment_topics = {}
        topic_terms = {}
        for doc_id, topic in numeric_topics.items():
            comment_id = document_id[doc_id]
            if topic not in topic_terms:
                topic_terms[topic] = [
                    word for word, _ in hdp_model.show_topic(topic)
                ]
            comment_topics[comment_id] = topic

        self.terms = topic_terms

        return comment_topics

    def _find_best_topic(
        self, model: HdpModel, corpus: list[tuple]
    ) -> dict[int, int]:
        """
        Computes most probable topic per document
        """
        best_topic = {}
        for doc_id, doc in enumerate(corpus):
            max_prob = float("-inf")
            topic_id = -1
            for topic_num, prob in model[doc]:
                if prob > max_prob:
                    max_prob = prob
                    topic_id = topic_num
            best_topic[doc_id] = topic_id

        return best_topic

    def get_terms(self) -> dict:
        """
        Returns terms for all topics
        """
        if not self.terms:
            return {}

        return self.terms

    def generate_search_vector(self) -> list[str]:
        """
        Creates an array of topics to use in the Django search model.
        """
        if not self.terms:
            raise RuntimeError(
                "Must run topic model before generating search vector"
            )

        search_vector = set()
        for term_list in self.terms.values():
            search_vector.update(term_list)

        return list(search_vector)

generate_search_vector()

Creates an array of topics to use in the Django search model.

Source code in civiclens/nlp/topics.py
def generate_search_vector(self) -> list[str]:
    """
    Creates an array of topics to use in the Django search model.
    """
    if not self.terms:
        raise RuntimeError(
            "Must run topic model before generating search vector"
        )

    search_vector = set()
    for term_list in self.terms.values():
        search_vector.update(term_list)

    return list(search_vector)

get_terms()

Returns terms for all topics

Source code in civiclens/nlp/topics.py
def get_terms(self) -> dict:
    """
    Returns terms for all topics
    """
    if not self.terms:
        return {}

    return self.terms

run_model(comments)

Runs HDP topic analysis.

Source code in civiclens/nlp/topics.py
def run_model(self, comments: list[Comment]):
    """
    Runs HDP topic analysis.
    """
    docs, document_id = self._process_text(comments)
    token_dict, corpus = self._create_corpus(docs)

    hdp_model = HdpModel(corpus, token_dict)
    numeric_topics = self._find_best_topic(hdp_model, corpus)

    comment_topics = {}
    topic_terms = {}
    for doc_id, topic in numeric_topics.items():
        comment_id = document_id[doc_id]
        if topic not in topic_terms:
            topic_terms[topic] = [
                word for word, _ in hdp_model.show_topic(topic)
            ]
        comment_topics[comment_id] = topic

    self.terms = topic_terms

    return comment_topics

LabelChain

Source code in civiclens/nlp/topics.py
class LabelChain:
    def __init__(self):
        self.tokenizer = label_tokenizer
        self.model = label_model

    def generate_label(self, terms: list[str]) -> tuple:
        """
        Create better topic terms.
        """
        text = ", ".join(terms)

        inputs = self.tokenizer(
            [text], max_length=512, truncation=True, return_tensors="pt"
        )
        output = self.model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=64
        )

        decoded_output = self.tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0]

        return tuple(set(decoded_output.strip().split(", ")))

generate_label(terms)

Create better topic terms.

Source code in civiclens/nlp/topics.py
def generate_label(self, terms: list[str]) -> tuple:
    """
    Create better topic terms.
    """
    text = ", ".join(terms)

    inputs = self.tokenizer(
        [text], max_length=512, truncation=True, return_tensors="pt"
    )
    output = self.model.generate(
        **inputs, num_beams=8, do_sample=True, min_length=10, max_length=64
    )

    decoded_output = self.tokenizer.batch_decode(
        output, skip_special_tokens=True
    )[0]

    return tuple(set(decoded_output.strip().split(", ")))

create_topics(comments)

Condense topics for document summary

Parameters:

    comments (list[Comment]): list of Comment objects. Required.

Returns:

    list[dict]: topic dictionaries with corresponding sentiment data, sorted by total comments represented.

Source code in civiclens/nlp/topics.py
def create_topics(comments: list[Comment]) -> dict:
    """
    Condense topics for document summary

    Args:
        comments: list of Comment objects

    Returns:
        Dictionary of topics, and corresponding sentiment data
    """
    temp = defaultdict(dict)

    for comment in comments:
        temp[comment.topic_label][comment.sentiment] = (
            temp[comment.topic_label].get(comment.sentiment, 0)
            + comment.num_represented
        )
        temp[comment.topic_label]["total"] = (
            temp[comment.topic_label].get("total", 0) + comment.num_represented
        )

    topics = []
    # create output dictionary
    for topic_label, part in temp.items():
        part["topic"] = topic_label
        topics.append(part)

    # sort topics by "total"
    return sorted(topics, key=lambda topic: topic["total"], reverse=True)
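
A small worked example (a sketch; in the full pipeline topic_label and sentiment are filled in by the topic and sentiment models):

from civiclens.nlp.tools import Comment
from civiclens.nlp.topics import create_topics

comments = [
    Comment(text="Fees are too high.", topic_label="fees", sentiment="negative", num_represented=3),
    Comment(text="The fee increase is fine.", topic_label="fees", sentiment="positive", num_represented=1),
    Comment(text="Extend the deadline.", topic_label="deadlines", sentiment="neutral", num_represented=2),
]
print(create_topics(comments))
# [{'negative': 3, 'total': 4, 'positive': 1, 'topic': 'fees'},
#  {'neutral': 2, 'total': 2, 'topic': 'deadlines'}]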

label_topics(topics, model)

Generates a label for all topics

Parameters:

    topics (dict[int, list]): dictionary of topics, as lists of terms. Required.
    model (LabelChain): LLM model to generate labels. Required.

Returns:

    dict[int, str]: dictionary mapping topics to labels.

Source code in civiclens/nlp/topics.py
def label_topics(topics: dict[int, list], model: LabelChain) -> dict[int, str]:
    """
    Generates a label for all topics

    Args:
        topics: dictionary of topics, as lists of terms
        model: LLM model to generate labels

    Returns:
        Dictionary of topics, and labels
    """
    labels = {}
    for topic, terms in topics.items():
        labels[topic] = model.generate_label(terms)

    return labels

stopwords(model_path)

Loads in a pickled set of stopwords for text processing.

Parameters:

    model_path (Path): path to the downloaded model file. Required.

Returns:

    set[str]: set of stop words.

Source code in civiclens/nlp/topics.py
def stopwords(model_path: Path) -> set[str]:
    """
    Loads in a pickled set of stopwords for text processing.

    Args:
        model_path: path from downloaded model

    Returns:
        Set of stop words.
    """
    with open(model_path, "rb") as f:
        stop_words = pickle.load(f)

    return stop_words

topic_comment_analysis(comment_data, model=None, labeler=None, sentiment_analyzer=None)

Run topic and sentiment analysis.

Parameters:

    comment_data (RepComments): RepComments object. Required.
    model (HDAModel): instance of the topic model class. Default: None.
    labeler (LabelChain): chain for generating topic labels. Default: None.
    sentiment_analyzer (Callable): function to analyze comment text sentiment. Default: None.

Returns:

    RepComments: RepComments object with full topic analysis complete.

Source code in civiclens/nlp/topics.py
def topic_comment_analysis(
    comment_data: RepComments,
    model: HDAModel = None,
    labeler: LabelChain = None,
    sentiment_analyzer: Callable = None,
) -> RepComments:
    """
    Run topic and sentiment analysis.

    Args:
        comment_data: RepComments object
        model: instance of the topic model class
        labeler: chain for generating topic labels
        sentiment_analyzer: function to analyze comment text sentiment

    Returns:
        RepComments object with full topic analysis complete
    """
    comments: list[Comment] = []

    if comment_data.summary:
        comments += [
            Comment(text=comment_data.summary, id="Summary", source="Summary")
        ]

    comments += comment_data.to_list()
    if not comment_data.rep_comments:
        comment_data.representative = False
        comments += comment_data.get_nonrepresentative_comments()

    comment_topics = model.run_model(comments)
    topic_terms = model.get_terms()
    topic_labels = label_topics(topic_terms, labeler)

    # filter out non_rep comments
    rep_comments: list[Comment] = []

    for comment in comments:
        comment.topic_label = topic_labels[comment_topics[comment.id]]
        comment.topic = comment_topics[comment.id]
        comment.sentiment = sentiment_analyzer(comment)
        if comment.representative or not comment_data.representative:
            rep_comments.append(comment)

    rep_comments = sorted(
        rep_comments, key=lambda comment: comment.num_represented, reverse=True
    )

    return RepComments(
        document_id=comment_data.document_id,
        doc_comments=comment_data.doc_comments,
        rep_comments=[comment.to_dict() for comment in rep_comments],
        doc_plain_english_title=comment_data.doc_plain_english_title,
        num_total_comments=comment_data.num_total_comments,
        num_unique_comments=comment_data.num_unique_comments,
        num_representative_comment=comment_data.num_representative_comment,
        topics=create_topics(comments),
        search_vector=model.generate_search_vector(),
        representative=comment_data.representative,
    )

text

clean_text(text, patterns=None)

String cleaning function for comments.

Parameters:

    text (str): comment text. Required.
    patterns (list[tuple]): optional list of (pattern, replacement) regular expression pairs (e.g. [(r'\w+', "-")]). Default: None.

Returns:

    str: cleaned version of text.

Source code in civiclens/utils/text.py
def clean_text(text: str, patterns: Optional[list[tuple]] = None) -> str:
    r"""
    String cleaning function for comments.

    Args:
        text (str): comment text
        patterns (list[tuple]): optional list of (pattern, replacement)
            regular expression pairs (e.g. [(r'\w+', "-")])

    Returns:
        Cleaned version of text
    """
    if patterns is None:
        patterns = []

    text = re.sub(r"&#39;", "'", text)  # this replaces html entity with '
    text = re.sub(r"&rdquo;", '"', text)  # this replaces html entity with "
    text = re.sub(r"&amp;", "&", text)  # this replaces html entity with &
    text = re.sub(r"รข", "", text)
    text = re.sub(r"<br\s*/?>", "", text)

    text = re.sub(r"<\s*br\s*/>", " ", text)
    text = re.sub(r"[^a-zA-Z0-9.'\"\?\: -]", "", text)
    text = re.sub(r"\w*ndash\w*", "", text)

    if patterns:
        for pattern, replacement in patterns:
            text = re.sub(pattern, replacement, text)

    # remove extra whitespace
    return re.sub(r"\s+", " ", text).strip()
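
For example (a sketch; note that characters outside the allowed set, such as commas, are stripped):

from civiclens.utils.text import clean_text

raw = "We don&#39;t support this rule.<br/> See &rdquo;Section 2&rdquo; for details."
print(clean_text(raw))
# We don't support this rule. See "Section 2" for details.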

regex_tokenize(text, pattern='\\W+')

Splits strings into tokens based on a regular expression.

Parameters:

    text (str): string to tokenize. Required.
    pattern (str): regular expression to split tokens on. Default: r'\W+' (non-word characters).

Returns:

    list[str]: list of strings representing the tokens.

Source code in civiclens/utils/text.py
def regex_tokenize(text: str, pattern: str = r"\W+"):
    """
    Splits strings into tokens based on a regular expression.

    Args:
        text: string to tokenize
        pattern: regular expression to split tokens on, defaults to non-word
            characters (r"\W+")

    Returns:
        List of strings representing the tokens
    """
    return re.split(pattern, text)
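
For example (the trailing empty string comes from the final period matching the split pattern):

from civiclens.utils.text import regex_tokenize

print(regex_tokenize("Lower the fee, please."))
# ['Lower', 'the', 'fee', 'please', '']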

sentence_splitter(text, sep='.')

Splits string into sentences.

Parameters:

    text (str): string to process. Required.
    sep (str): value to separate the string on. Default: '.'.

Returns:

    list[str]: list of strings split on the separator value.

Source code in civiclens/utils/text.py
def sentence_splitter(text: str, sep: str = ".") -> list[str]:
    """
    Splits string into sentences.

    Args:
        text: string to process
        sep: value to separate the string on, defaults to '.'

    Returns:
        List of strings split on the separator value
    """
    # remove periods not at the end of sentences
    clean = re.sub(r"\.(?!\s)", " ", text)
    sentences = clean.split(sep)

    return [sentence.strip() + "." for sentence in sentences if sentence]
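
For example:

from civiclens.utils.text import sentence_splitter

text = "The rule takes effect Jan 1. Comments are due Dec 15."
print(sentence_splitter(text))
# ['The rule takes effect Jan 1.', 'Comments are due Dec 15.']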

truncate(text, num_words)

Truncates comments.

Parameters:

    text (str): text of the comment. Required.
    num_words (int): number of words to keep. Required.

Returns:

    str: truncated comment.

Source code in civiclens/utils/text.py
def truncate(text: str, num_words: int) -> str:
    """
    Truncates comments.

    Args:
        text (str): Text of the comment
        num_words (int): Number of words to keep

    Returns:
        Truncated comment
    """
    words = text.split(" ")

    return " ".join(words[:num_words])

database_access

Database

Wrapper for the CivicLens Postgres DB.

Source code in civiclens/utils/database_access.py
class Database:
    """
    Wrapper for the CivicLens Postgres DB.
    """

    def __init__(self):
        self.conn = psycopg2.connect(
            database=os.getenv("DATABASE"),
            user=os.getenv("DATABASE_USER"),
            password=os.getenv("DATABASE_PASSWORD"),
            host=os.getenv("DATABASE_HOST"),
            port=os.getenv("DATABASE_PORT"),
        )

    def cursor(self):
        return self.conn.cursor()

    def close(self):
        return self.conn.close()

    def commit(self):
        return self.conn.commit()

pull_data(connection, query, schema=None, return_type='df')

Takes a SQL query and returns a polars dataframe

Parameters:

    connection (Database): database connection to query against. Required.
    query (str): SQL query. Required.
    schema (list[str]): list of column names for the dataframe. Default: None.
    return_type (str): "df" or "list". Default: 'df'.

Returns:

    DataFrame | List[Tuple]: polars df of comment data or list of comment data.

Source code in civiclens/utils/database_access.py
def pull_data(
    connection: Database,
    query: str,
    schema: Optional[List[str]] = None,
    return_type: str = "df",
) -> pl.DataFrame | List[Tuple]:
    """Takes a SQL Query and returns a polars dataframe

    Args:
        connection (Database): database connection to query against
        query (str): SQL Query
        schema (list[str]): list of column names for the dataframe
        return_type (str): "df" or "list"

    Returns:
        Polars df of comment data or list of comment data
    """ """"""
    if return_type == "df" and not schema:
        raise ValueError("Must input schema to return df")

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    except (Exception, psycopg2.Error) as error:
        raise RuntimeError(
            f"Error while connecting to PostgreSQL: {str(error).strip()}"
        ) from error

    finally:
        # Close the connection and cursor to free resources
        if connection:
            cursor.close()
            connection.close()
            print("PostgreSQL connection is closed")

    if return_type == "df":
        results = pl.DataFrame(results, schema=schema)

    return results
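
A hedged usage sketch; this requires a reachable Postgres instance and the DATABASE* environment variables that Database reads (the table name below comes from get_doc_comments above):

from civiclens.utils.database_access import Database, pull_data

db = Database()  # connects using DATABASE, DATABASE_USER, etc. from the environment
rows = pull_data(
    connection=db,
    query="SELECT id, document_id, comment FROM regulations_comment LIMIT 5;",
    schema=["id", "document_id", "comment"],
)
print(rows.shape)  # (up to 5, 3)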

upload_comments(connection, comments)

Uploads comment data to database.

Parameters:

    connection (Database): Postgres client. Required.
    comments (RepComments): comments to be uploaded. Required.

Returns:

    None: uploads the comments to the database.

Source code in civiclens/utils/database_access.py
def upload_comments(connection: Database, comments: RepComments) -> None:
    """
    Uploads comment data to database.

    Args:
        connection: Postgres client
        comments: comments to be uploaded

    Returns:
        None, uploads comments to database
    """
    query = """
    INSERT INTO regulations_nlpoutput (
            comments,
            is_representative,
            doc_plain_english_title,
            num_total_comments,
            num_unique_comments,
            num_representative_comment,
            topics,
            num_topics,
            last_updated,
            created_at,
            search_topics,
            document_id)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (document_id)
        DO UPDATE SET
            comments = EXCLUDED.comments,
            is_representative = EXCLUDED.is_representative,
            doc_plain_english_title = EXCLUDED.doc_plain_english_title,
            num_total_comments = EXCLUDED.num_total_comments,
            num_unique_comments = EXCLUDED.num_unique_comments,
            num_representative_comment = EXCLUDED.num_representative_comment,
            topics = EXCLUDED.topics,
            num_topics = EXCLUDED.num_topics,
            last_updated = NOW(),
            search_topics = EXCLUDED.search_topics
        WHERE regulations_nlpoutput.last_updated IS NULL
            OR regulations_nlpoutput.last_updated < EXCLUDED.last_updated;
            """

    values = (
        json.dumps(comments.rep_comments),
        comments.representative,
        comments.doc_plain_english_title,
        comments.num_total_comments,
        comments.num_unique_comments,
        comments.num_representative_comment,
        json.dumps(comments.topics),
        len(comments.topics),
        datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
        comments.last_updated.strftime("%m/%d/%Y, %H:%M:%S"),
        ", ".join(comments.search_vector),
        comments.document_id,
    )

    try:
        cursor = connection.cursor()
        cursor.execute(query, values)
        connection.commit()

    except Exception as e:
        print(e)

    if connection:
        cursor.close()
        connection.close()
        print("PostgreSQL connection is closed")